aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst
diff options
context:
space:
mode:
authorMichael Armbrust <michael@databricks.com>2014-07-08 10:36:18 -0700
committerMichael Armbrust <michael@databricks.com>2014-07-08 10:38:30 -0700
commit4bf8ddaeefb5e078bfd163d3a6041b9c53b929df (patch)
treea2020330d2981ebd9b36c74ee47a1da854b19835 /sql/catalyst
parent3e95225c7c7cf3cf72b32e4c691333d0d663cec6 (diff)
downloadspark-4bf8ddaeefb5e078bfd163d3a6041b9c53b929df.tar.gz
spark-4bf8ddaeefb5e078bfd163d3a6041b9c53b929df.tar.bz2
spark-4bf8ddaeefb5e078bfd163d3a6041b9c53b929df.zip
[SPARK-2395][SQL] Optimize common LIKE patterns.
Author: Michael Armbrust <michael@databricks.com> Closes #1325 from marmbrus/slowLike and squashes the following commits: 023c3eb [Michael Armbrust] add comment. 8b421c2 [Michael Armbrust] Handle the case where the final % is actually escaped. d34d37e [Michael Armbrust] add periods. 3bbf35f [Michael Armbrust] Roll back changes to SparkBuild 53894b1 [Michael Armbrust] Fix grammar. 4094462 [Michael Armbrust] Fix grammar. 6d3d0a0 [Michael Armbrust] Optimize common LIKE patterns. (cherry picked from commit cc3e0a14daf756ff5c2d4e7916438e175046e5bb) Signed-off-by: Michael Armbrust <michael@databricks.com>
Diffstat (limited to 'sql/catalyst')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala51
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala23
2 files changed, 74 insertions, 0 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index c074b7bb01..347471cebd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -156,3 +156,54 @@ case class Lower(child: Expression) extends UnaryExpression with CaseConversionE
override def toString() = s"Lower($child)"
}
+
+/** A base class for functions that compare two strings, returning a boolean. */
+abstract class StringComparison extends Expression {
+ self: Product =>
+
+ type EvaluatedType = Any
+
+ def left: Expression
+ def right: Expression
+
+ override def references = children.flatMap(_.references).toSet
+ override def children = left :: right :: Nil
+
+ override def nullable: Boolean = true
+ override def dataType: DataType = BooleanType
+
+ def compare(l: String, r: String): Boolean
+
+ override def eval(input: Row): Any = {
+ val leftEval = left.eval(input).asInstanceOf[String]
+ if(leftEval == null) {
+ null
+ } else {
+ val rightEval = right.eval(input).asInstanceOf[String]
+ if (rightEval == null) null else compare(leftEval, rightEval)
+ }
+ }
+
+ override def toString() = s"$nodeName($left, $right)"
+}
+
+/**
+ * A function that returns true if the string `left` contains the string `right`.
+ */
+case class Contains(left: Expression, right: Expression) extends StringComparison {
+ override def compare(l: String, r: String) = l.contains(r)
+}
+
+/**
+ * A function that returns true if the string `left` starts with the string `right`.
+ */
+case class StartsWith(left: Expression, right: Expression) extends StringComparison {
+ def compare(l: String, r: String) = l.startsWith(r)
+}
+
+/**
+ * A function that returns true if the string `left` ends with the string `right`.
+ */
+case class EndsWith(left: Expression, right: Expression) extends StringComparison {
+ def compare(l: String, r: String) = l.endsWith(r)
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 5bc478484c..17c80b6ea4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -34,6 +34,7 @@ object Optimizer extends RuleExecutor[LogicalPlan] {
Batch("ConstantFolding", FixedPoint(100),
NullPropagation,
ConstantFolding,
+ LikeSimplification,
BooleanSimplification,
SimplifyFilters,
SimplifyCasts,
@@ -112,6 +113,28 @@ object ColumnPruning extends Rule[LogicalPlan] {
}
/**
+ * Simplifies LIKE expressions that do not need full regular expressions to evaluate the condition.
+ * For example, when the expression is just checking to see if a string starts with a given
+ * pattern.
+ */
+object LikeSimplification extends Rule[LogicalPlan] {
+ // if guards below protect from escapes on trailing %.
+ // Cases like "something\%" are not optimized, but this does not affect correctness.
+ val startsWith = "([^_%]+)%".r
+ val endsWith = "%([^_%]+)".r
+ val contains = "%([^_%]+)%".r
+
+ def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+ case Like(l, Literal(startsWith(pattern), StringType)) if !pattern.endsWith("\\") =>
+ StartsWith(l, Literal(pattern))
+ case Like(l, Literal(endsWith(pattern), StringType)) =>
+ EndsWith(l, Literal(pattern))
+ case Like(l, Literal(contains(pattern), StringType)) if !pattern.endsWith("\\") =>
+ Contains(l, Literal(pattern))
+ }
+}
+
+/**
* Replaces [[Expression Expressions]] that can be statically evaluated with
* equivalent [[Literal]] values. This rule is more specific with
* Null value propagation from bottom to top of the expression tree.