author    Dongjoon Hyun <dongjoon@apache.org>  2016-04-14 13:34:29 -0700
committer Reynold Xin <rxin@databricks.com>  2016-04-14 13:34:29 -0700
commit    d7e124edfe2578ecdf8e816a4dda3ce430a09172 (patch)
tree      e7a6dc3bbc06803b10c183977d3588383039b01d
parent    bc748b7b8f3b5aee28aff9ea078c216ca137a5b7 (diff)
[SPARK-14545][SQL] Improve `LikeSimplification` by adding `a%b` rule
## What changes were proposed in this pull request?

Currently, `LikeSimplification` handles the following four rules.

- 'a%' => expr.StartsWith("a")
- '%b' => expr.EndsWith("b")
- '%a%' => expr.Contains("a")
- 'a' => EqualTo("a")

This PR adds the following rule.

- 'a%b' => expr.Length() >= 2 && expr.StartsWith("a") && expr.EndsWith("b")

Here, 2 is statically calculated from "a".size + "b".size.

**Before**
```
scala> sql("select a from (select explode(array('abc','adc')) a) T where a like 'a%c'").explain()
== Physical Plan ==
WholeStageCodegen
:  +- Filter a#5 LIKE a%c
:     +- INPUT
+- Generate explode([abc,adc]), false, false, [a#5]
   +- Scan OneRowRelation[]
```

**After**
```
scala> sql("select a from (select explode(array('abc','adc')) a) T where a like 'a%c'").explain()
== Physical Plan ==
WholeStageCodegen
:  +- Filter ((length(a#5) >= 2) && (StartsWith(a#5, a) && EndsWith(a#5, c)))
:     +- INPUT
+- Generate explode([abc,adc]), false, false, [a#5]
   +- Scan OneRowRelation[]
```

## How was this patch tested?

Pass the Jenkins tests (including a new test case).

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #12312 from dongjoon-hyun/SPARK-14545.
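The `Length` guard matters when the prefix and suffix of an 'a%b' pattern can overlap in a short input. A minimal plain-Scala illustration of that corner case (not part of the patch; `input`, `prefix`, and `postfix` are just local names for the example):

```scala
// Pattern 'a%a' rewritten without the guard would over-match the one-character input "a".
val input = "a"
val prefix = "a"
val postfix = "a"

// StartsWith && EndsWith alone: true, but "a" does NOT match LIKE 'a%a'.
val withoutGuard = input.startsWith(prefix) && input.endsWith(postfix)   // true

// Adding Length(input) >= prefix.size + postfix.size rejects it (1 >= 2 is false).
val withGuard = input.length >= prefix.length + postfix.length &&
  input.startsWith(prefix) && input.endsWith(postfix)                    // false
```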
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 28
-rw-r--r--  sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala | 14
2 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index aeb1842677..f5172b213a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -517,22 +517,28 @@ object LikeSimplification extends Rule[LogicalPlan] {
// Cases like "something\%" are not optimized, but this does not affect correctness.
private val startsWith = "([^_%]+)%".r
private val endsWith = "%([^_%]+)".r
+ private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r
private val contains = "%([^_%]+)%".r
private val equalTo = "([^_%]*)".r
def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
- case Like(l, Literal(utf, StringType)) =>
- utf.toString match {
- case startsWith(pattern) if !pattern.endsWith("\\") =>
- StartsWith(l, Literal(pattern))
- case endsWith(pattern) =>
- EndsWith(l, Literal(pattern))
- case contains(pattern) if !pattern.endsWith("\\") =>
- Contains(l, Literal(pattern))
- case equalTo(pattern) =>
- EqualTo(l, Literal(pattern))
+ case Like(input, Literal(pattern, StringType)) =>
+ pattern.toString match {
+ case startsWith(prefix) if !prefix.endsWith("\\") =>
+ StartsWith(input, Literal(prefix))
+ case endsWith(postfix) =>
+ EndsWith(input, Literal(postfix))
+ // The 'a%a' pattern is basically the same as 'a%' && '%a'.
+ // However, the additional `Length` condition is required to prevent 'a' from matching 'a%a'.
+ case startsAndEndsWith(prefix, postfix) if !prefix.endsWith("\\") =>
+ And(GreaterThanOrEqual(Length(input), Literal(prefix.size + postfix.size)),
+ And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix))))
+ case contains(infix) if !infix.endsWith("\\") =>
+ Contains(input, Literal(infix))
+ case equalTo(str) =>
+ EqualTo(input, Literal(str))
case _ =>
- Like(l, Literal.create(utf, StringType))
+ Like(input, Literal.create(pattern, StringType))
}
}
}
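To see the classification logic in one place, here is a self-contained plain-Scala sketch of how the rule's regexes map a LIKE pattern to an equivalent String predicate. It is an illustration only (the object name and the `simplify` helper are hypothetical); the real rule builds Catalyst expressions such as `StartsWith`, `EndsWith`, and `Length` rather than closures.

```scala
// Hypothetical standalone sketch of the pattern classification used by
// LikeSimplification, operating on plain Strings instead of Catalyst expressions.
object LikePatternSketch {
  // Same regexes as the optimizer rule; each must match the *entire* pattern,
  // because Scala's Regex extractor anchors the match.
  private val startsWith        = "([^_%]+)%".r
  private val endsWith          = "%([^_%]+)".r
  private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r
  private val contains          = "%([^_%]+)%".r
  private val equalTo           = "([^_%]*)".r

  // Returns Some(predicate) when the pattern can be simplified, None otherwise
  // (the real rule keeps the original Like expression in that case).
  def simplify(pattern: String): Option[String => Boolean] = pattern match {
    case startsWith(prefix) if !prefix.endsWith("\\") =>
      Some(_.startsWith(prefix))
    case endsWith(postfix) =>
      Some(_.endsWith(postfix))
    case startsAndEndsWith(prefix, postfix) if !prefix.endsWith("\\") =>
      // The new rule: length guard plus the two affix checks.
      Some(s => s.length >= prefix.length + postfix.length &&
        s.startsWith(prefix) && s.endsWith(postfix))
    case contains(infix) if !infix.endsWith("\\") =>
      Some(_.contains(infix))
    case equalTo(str) =>
      Some(_ == str)
    case _ =>
      None
  }

  def main(args: Array[String]): Unit = {
    val p = simplify("a%c").get
    println(p("abc")) // true
    println(p("adc")) // true
    println(p("ab"))  // false: ends with "b", not "c"
  }
}
```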
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
index 741bc113cf..fdde89d079 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
@@ -61,6 +61,20 @@ class LikeSimplificationSuite extends PlanTest {
comparePlans(optimized, correctAnswer)
}
+ test("simplify Like into startsWith and EndsWith") {
+ val originalQuery =
+ testRelation
+ .where(('a like "abc\\%def") || ('a like "abc%def"))
+
+ val optimized = Optimize.execute(originalQuery.analyze)
+ val correctAnswer = testRelation
+ .where(('a like "abc\\%def") ||
+ (Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def"))))
+ .analyze
+
+ comparePlans(optimized, correctAnswer)
+ }
+
test("simplify Like into Contains") {
val originalQuery =
testRelation