diff options
author | Michael Armbrust <michael@databricks.com> | 2014-11-14 12:00:08 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2014-11-14 12:00:08 -0800 |
commit | 77e845ca7726ffee2d6f8e33ea56ec005dde3874 (patch) | |
tree | f562516ace6d891f2455d453559db67b7bc3b4f7 /sql/catalyst | |
parent | e421072da0ea87e7056cc3f2130ddaafc731530f (diff) | |
download | spark-77e845ca7726ffee2d6f8e33ea56ec005dde3874.tar.gz spark-77e845ca7726ffee2d6f8e33ea56ec005dde3874.tar.bz2 spark-77e845ca7726ffee2d6f8e33ea56ec005dde3874.zip |
[SPARK-4394][SQL] Data Sources API Improvements
This PR adds two features to the data sources API:
- Support for pushing down `IN` filters
- The ability for relations to optionally provide information about their `sizeInBytes`.
Author: Michael Armbrust <michael@databricks.com>
Closes #3260 from marmbrus/sourcesImprovements and squashes the following commits:
9a5e171 [Michael Armbrust] Use method instead of configuration directly
99c0e6b [Michael Armbrust] Add support for sizeInBytes.
416f167 [Michael Armbrust] Support for IN in data sources API.
2a04ab3 [Michael Armbrust] Simplify implementation of InSet.
Diffstat (limited to 'sql/catalyst')
4 files changed, 11 insertions, 12 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 1e22b2d03c..94b6fb084d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -99,10 +99,10 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { * Optimized version of In clause, when all filter values of In clause are * static. */ -case class InSet(value: Expression, hset: HashSet[Any], child: Seq[Expression]) +case class InSet(value: Expression, hset: Set[Any]) extends Predicate { - def children = child + def children = value :: Nil def nullable = true // TODO: Figure out correct nullability semantics of IN. override def toString = s"$value INSET ${hset.mkString("(", ",", ")")}" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index a4aa322fc5..f164a6c68a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -289,7 +289,7 @@ object OptimizeIn extends Rule[LogicalPlan] { case q: LogicalPlan => q transformExpressionsDown { case In(v, list) if !list.exists(!_.isInstanceOf[Literal]) => val hSet = list.map(e => e.eval(null)) - InSet(v, HashSet() ++ hSet, v +: list) + InSet(v, HashSet() ++ hSet) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index 918996f11d..2f57be94a8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -158,13 +158,13 @@ class ExpressionEvaluationSuite extends FunSuite { val nl = Literal(null) val s = Seq(one, two) val nullS = Seq(one, two, null) - checkEvaluation(InSet(one, hS, one +: s), true) - checkEvaluation(InSet(two, hS, two +: s), true) - checkEvaluation(InSet(two, nS, two +: nullS), true) - checkEvaluation(InSet(nl, nS, nl +: nullS), true) - checkEvaluation(InSet(three, hS, three +: s), false) - checkEvaluation(InSet(three, nS, three +: nullS), false) - checkEvaluation(InSet(one, hS, one +: s) && InSet(two, hS, two +: s), true) + checkEvaluation(InSet(one, hS), true) + checkEvaluation(InSet(two, hS), true) + checkEvaluation(InSet(two, nS), true) + checkEvaluation(InSet(nl, nS), true) + checkEvaluation(InSet(three, hS), false) + checkEvaluation(InSet(three, nS), false) + checkEvaluation(InSet(one, hS) && InSet(two, hS), true) } test("MaxOf") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala index 97a78ec971..017b180c57 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala @@ -52,8 +52,7 @@ class OptimizeInSuite extends PlanTest { val optimized = Optimize(originalQuery.analyze) val correctAnswer = testRelation - .where(InSet(UnresolvedAttribute("a"), HashSet[Any]()+1+2, - UnresolvedAttribute("a") +: Seq(Literal(1),Literal(2)))) + .where(InSet(UnresolvedAttribute("a"), HashSet[Any]()+1+2)) .analyze comparePlans(optimized, correctAnswer) |