aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorCheng Lian <lian@databricks.com>2014-11-17 15:33:13 -0800
committerMichael Armbrust <michael@databricks.com>2014-11-17 15:33:13 -0800
commit5ce7dae859dc273b0fc532c9456b5960b1eca399 (patch)
tree5ce02e2826733a84e6cfd659886cfc023deefed6 /sql
parent0f3ceb56c78e7260725a09fba0e10aa193cbda4b (diff)
downloadspark-5ce7dae859dc273b0fc532c9456b5960b1eca399.tar.gz
spark-5ce7dae859dc273b0fc532c9456b5960b1eca399.tar.bz2
spark-5ce7dae859dc273b0fc532c9456b5960b1eca399.zip
[SQL] Makes conjunction pushdown more aggressive for in-memory table
This is inspired by the [Parquet record filter generation code](https://github.com/apache/spark/blob/64c6b9bad559c21f25cd9fbe37c8813cdab939f2/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala#L387-L400). <!-- Reviewable:start --> [<img src="https://reviewable.io/review_button.png" height=40 alt="Review on Reviewable"/>](https://reviewable.io/reviews/apache/spark/3318) <!-- Reviewable:end --> Author: Cheng Lian <lian@databricks.com> Closes #3318 from liancheng/aggresive-conj-pushdown and squashes the following commits: 78b69d2 [Cheng Lian] Makes conjunction pushdown more aggressive
Diffstat (limited to 'sql')
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala4
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala12
2 files changed, 11 insertions, 5 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
index 455b415d9d..881d32b105 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala
@@ -182,8 +182,8 @@ private[sql] case class InMemoryColumnarTableScan(
// to evaluate to `true' based on statistics collected about this partition batch.
val buildFilter: PartialFunction[Expression, Expression] = {
case And(lhs: Expression, rhs: Expression)
- if buildFilter.isDefinedAt(lhs) && buildFilter.isDefinedAt(rhs) =>
- buildFilter(lhs) && buildFilter(rhs)
+ if buildFilter.isDefinedAt(lhs) || buildFilter.isDefinedAt(rhs) =>
+ (buildFilter.lift(lhs) ++ buildFilter.lift(rhs)).reduce(_ && _)
case Or(lhs: Expression, rhs: Expression)
if buildFilter.isDefinedAt(lhs) && buildFilter.isDefinedAt(rhs) =>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
index 9ba3c21017..82afa31a99 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/PartitionBatchPruningSuite.scala
@@ -78,17 +78,23 @@ class PartitionBatchPruningSuite extends FunSuite with BeforeAndAfterAll with Be
// Conjunction and disjunction
checkBatchPruning("SELECT key FROM pruningData WHERE key > 8 AND key <= 21", 2, 3)(9 to 21)
checkBatchPruning("SELECT key FROM pruningData WHERE key < 2 OR key > 99", 2, 2)(Seq(1, 100))
+ checkBatchPruning("SELECT key FROM pruningData WHERE key < 12 AND key IS NOT NULL", 1, 2)(1 to 11)
checkBatchPruning("SELECT key FROM pruningData WHERE key < 2 OR (key > 78 AND key < 92)", 3, 4) {
Seq(1) ++ (79 to 91)
}
+ checkBatchPruning("SELECT key FROM pruningData WHERE NOT (key < 88)", 1, 2) {
+ // Although the `NOT` operator isn't supported directly, the optimizer can transform
+ // `NOT (a < b)` to `b >= a`
+ 88 to 100
+ }
// With unsupported predicate
- checkBatchPruning("SELECT key FROM pruningData WHERE NOT (key < 88)", 1, 2)(88 to 100)
- checkBatchPruning("SELECT key FROM pruningData WHERE key < 12 AND key IS NOT NULL", 1, 2)(1 to 11)
-
{
val seq = (1 to 30).mkString(", ")
checkBatchPruning(s"SELECT key FROM pruningData WHERE NOT (key IN ($seq))", 5, 10)(31 to 100)
+ checkBatchPruning(s"SELECT key FROM pruningData WHERE NOT (key IN ($seq)) AND key > 88", 1, 2) {
+ 89 to 100
+ }
}
def checkBatchPruning(