diff options
author | Dongjoon Hyun <dongjoon@apache.org> | 2016-04-14 13:34:29 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-04-14 13:34:29 -0700 |
commit | d7e124edfe2578ecdf8e816a4dda3ce430a09172 (patch) | |
tree | e7a6dc3bbc06803b10c183977d3588383039b01d | |
parent | bc748b7b8f3b5aee28aff9ea078c216ca137a5b7 (diff) | |
download | spark-d7e124edfe2578ecdf8e816a4dda3ce430a09172.tar.gz spark-d7e124edfe2578ecdf8e816a4dda3ce430a09172.tar.bz2 spark-d7e124edfe2578ecdf8e816a4dda3ce430a09172.zip |
[SPARK-14545][SQL] Improve `LikeSimplification` by adding `a%b` rule
## What changes were proposed in this pull request?
Currently, `LikeSimplification` handles the following four rules.
- 'a%' => expr.StartsWith("a")
- '%b' => expr.EndsWith("b")
- '%a%' => expr.Contains("a")
- 'a' => EqualTo("a")
This PR adds the following rule.
- 'a%b' => expr.Length() >= 2 && expr.StartsWith("a") && expr.EndsWith("b")
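Here, 2 is statically calculated from "a".size + "b".size. The length check is required so that a string too short to contain both the prefix and the suffix without overlap is rejected (e.g. 'a' must not match 'a%a').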
**Before**
```
scala> sql("select a from (select explode(array('abc','adc')) a) T where a like 'a%c'").explain()
== Physical Plan ==
WholeStageCodegen
: +- Filter a#5 LIKE a%c
: +- INPUT
+- Generate explode([abc,adc]), false, false, [a#5]
+- Scan OneRowRelation[]
```
**After**
```
scala> sql("select a from (select explode(array('abc','adc')) a) T where a like 'a%c'").explain()
== Physical Plan ==
WholeStageCodegen
: +- Filter ((length(a#5) >= 2) && (StartsWith(a#5, a) && EndsWith(a#5, c)))
: +- INPUT
+- Generate explode([abc,adc]), false, false, [a#5]
+- Scan OneRowRelation[]
```
## How was this patch tested?
Pass the Jenkins tests (including the new test case).
Author: Dongjoon Hyun <dongjoon@apache.org>
Closes #12312 from dongjoon-hyun/SPARK-14545.
2 files changed, 31 insertions, 11 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index aeb1842677..f5172b213a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -517,22 +517,28 @@ object LikeSimplification extends Rule[LogicalPlan] { // Cases like "something\%" are not optimized, but this does not affect correctness. private val startsWith = "([^_%]+)%".r private val endsWith = "%([^_%]+)".r + private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r private val contains = "%([^_%]+)%".r private val equalTo = "([^_%]*)".r def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { - case Like(l, Literal(utf, StringType)) => - utf.toString match { - case startsWith(pattern) if !pattern.endsWith("\\") => - StartsWith(l, Literal(pattern)) - case endsWith(pattern) => - EndsWith(l, Literal(pattern)) - case contains(pattern) if !pattern.endsWith("\\") => - Contains(l, Literal(pattern)) - case equalTo(pattern) => - EqualTo(l, Literal(pattern)) + case Like(input, Literal(pattern, StringType)) => + pattern.toString match { + case startsWith(prefix) if !prefix.endsWith("\\") => + StartsWith(input, Literal(prefix)) + case endsWith(postfix) => + EndsWith(input, Literal(postfix)) + // 'a%a' pattern is basically same with 'a%' && '%a'. + // However, the additional `Length` condition is required to prevent 'a' match 'a%a'. 
+ case startsAndEndsWith(prefix, postfix) if !prefix.endsWith("\\") => + And(GreaterThanOrEqual(Length(input), Literal(prefix.size + postfix.size)), + And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix)))) + case contains(infix) if !infix.endsWith("\\") => + Contains(input, Literal(infix)) + case equalTo(str) => + EqualTo(input, Literal(str)) case _ => - Like(l, Literal.create(utf, StringType)) + Like(input, Literal.create(pattern, StringType)) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala index 741bc113cf..fdde89d079 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala @@ -61,6 +61,20 @@ class LikeSimplificationSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("simplify Like into startsWith and EndsWith") { + val originalQuery = + testRelation + .where(('a like "abc\\%def") || ('a like "abc%def")) + + val optimized = Optimize.execute(originalQuery.analyze) + val correctAnswer = testRelation + .where(('a like "abc\\%def") || + (Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def")))) + .analyze + + comparePlans(optimized, correctAnswer) + } + test("simplify Like into Contains") { val originalQuery = testRelation |