From 4bf8ddaeefb5e078bfd163d3a6041b9c53b929df Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 8 Jul 2014 10:36:18 -0700 Subject: [SPARK-2395][SQL] Optimize common LIKE patterns. Author: Michael Armbrust Closes #1325 from marmbrus/slowLike and squashes the following commits: 023c3eb [Michael Armbrust] add comment. 8b421c2 [Michael Armbrust] Handle the case where the final % is actually escaped. d34d37e [Michael Armbrust] add periods. 3bbf35f [Michael Armbrust] Roll back changes to SparkBuild 53894b1 [Michael Armbrust] Fix grammar. 4094462 [Michael Armbrust] Fix grammar. 6d3d0a0 [Michael Armbrust] Optimize common LIKE patterns. (cherry picked from commit cc3e0a14daf756ff5c2d4e7916438e175046e5bb) Signed-off-by: Michael Armbrust --- .../catalyst/expressions/stringOperations.scala | 51 ++++++++++++++++++++++ .../spark/sql/catalyst/optimizer/Optimizer.scala | 23 ++++++++++ 2 files changed, 74 insertions(+) (limited to 'sql/catalyst') diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index c074b7bb01..347471cebd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -156,3 +156,54 @@ case class Lower(child: Expression) extends UnaryExpression with CaseConversionE override def toString() = s"Lower($child)" } + +/** A base class for functions that compare two strings, returning a boolean. */ +abstract class StringComparison extends Expression { + self: Product => + + type EvaluatedType = Any + + def left: Expression + def right: Expression + + override def references = children.flatMap(_.references).toSet + override def children = left :: right :: Nil + + override def nullable: Boolean = true + override def dataType: DataType = BooleanType + + def compare(l: String, r: String): Boolean + + override def eval(input: Row): Any = { + val leftEval = left.eval(input).asInstanceOf[String] + if(leftEval == null) { + null + } else { + val rightEval = right.eval(input).asInstanceOf[String] + if (rightEval == null) null else compare(leftEval, rightEval) + } + } + + override def toString() = s"$nodeName($left, $right)" +} + +/** + * A function that returns true if the string `left` contains the string `right`. + */ +case class Contains(left: Expression, right: Expression) extends StringComparison { + override def compare(l: String, r: String) = l.contains(r) +} + +/** + * A function that returns true if the string `left` starts with the string `right`. + */ +case class StartsWith(left: Expression, right: Expression) extends StringComparison { + def compare(l: String, r: String) = l.startsWith(r) +} + +/** + * A function that returns true if the string `left` ends with the string `right`. + */ +case class EndsWith(left: Expression, right: Expression) extends StringComparison { + def compare(l: String, r: String) = l.endsWith(r) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 5bc478484c..17c80b6ea4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -34,6 +34,7 @@ object Optimizer extends RuleExecutor[LogicalPlan] { Batch("ConstantFolding", FixedPoint(100), NullPropagation, ConstantFolding, + LikeSimplification, BooleanSimplification, SimplifyFilters, SimplifyCasts, @@ -111,6 +112,28 @@ object ColumnPruning extends Rule[LogicalPlan] { } } +/** + * Simplifies LIKE expressions that do not need full regular expressions to evaluate the condition. + * For example, when the expression is just checking to see if a string starts with a given + * pattern. + */ +object LikeSimplification extends Rule[LogicalPlan] { + // if guards below protect from escapes on trailing %. + // Cases like "something\%" are not optimized, but this does not affect correctness. + val startsWith = "([^_%]+)%".r + val endsWith = "%([^_%]+)".r + val contains = "%([^_%]+)%".r + + def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + case Like(l, Literal(startsWith(pattern), StringType)) if !pattern.endsWith("\\") => + StartsWith(l, Literal(pattern)) + case Like(l, Literal(endsWith(pattern), StringType)) => + EndsWith(l, Literal(pattern)) + case Like(l, Literal(contains(pattern), StringType)) if !pattern.endsWith("\\") => + Contains(l, Literal(pattern)) + } +} + /** * Replaces [[Expression Expressions]] that can be statically evaluated with * equivalent [[Literal]] values. This rule is more specific with -- cgit v1.2.3