aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst/src/main
diff options
context:
space:
mode:
authorDavies Liu <davies@databricks.com>2016-08-11 09:47:19 -0700
committerDavies Liu <davies.liu@gmail.com>2016-08-11 09:47:19 -0700
commit0f72e4f04b227b9cd5d7ae5958e09b1def49420a (patch)
tree1ec2e07daa2f57687aaac833219ae4029ac076de /sql/catalyst/src/main
parent4d496802f592dca96dada73b24afc93c668a7f26 (diff)
downloadspark-0f72e4f04b227b9cd5d7ae5958e09b1def49420a.tar.gz
spark-0f72e4f04b227b9cd5d7ae5958e09b1def49420a.tar.bz2
spark-0f72e4f04b227b9cd5d7ae5958e09b1def49420a.zip
[SPARK-16958] [SQL] Reuse subqueries within the same query
## What changes were proposed in this pull request? There could be multiple subqueries that generate the same results; we could re-use the result instead of running it multiple times. This PR also cleans up how we run subqueries. For SQL query ```sql select id,(select avg(id) from t) from t where id > (select avg(id) from t) ``` The explain is ``` == Physical Plan == *Project [id#15L, Subquery subquery29 AS scalarsubquery()#35] : +- Subquery subquery29 : +- *HashAggregate(keys=[], functions=[avg(id#15L)]) : +- Exchange SinglePartition : +- *HashAggregate(keys=[], functions=[partial_avg(id#15L)]) : +- *Range (0, 1000, splits=4) +- *Filter (cast(id#15L as double) > Subquery subquery29) : +- Subquery subquery29 : +- *HashAggregate(keys=[], functions=[avg(id#15L)]) : +- Exchange SinglePartition : +- *HashAggregate(keys=[], functions=[partial_avg(id#15L)]) : +- *Range (0, 1000, splits=4) +- *Range (0, 1000, splits=4) ``` The visualized plan: ![reuse-subquery](https://cloud.githubusercontent.com/assets/40902/17573229/e578d93c-5f0d-11e6-8a3c-0150d81d3aed.png) ## How was this patch tested? Existing tests. Author: Davies Liu <davies@databricks.com> Closes #14548 from davies/subq.
Diffstat (limited to 'sql/catalyst/src/main')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala7
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala4
2 files changed, 9 insertions, 2 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala
index 08cb6c0134..ac44f08897 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala
@@ -102,6 +102,13 @@ case class PredicateSubquery(
override def nullable: Boolean = nullAware
override def plan: LogicalPlan = SubqueryAlias(toString, query)
override def withNewPlan(plan: LogicalPlan): PredicateSubquery = copy(query = plan)
+ override def semanticEquals(o: Expression): Boolean = o match {
+ case p: PredicateSubquery =>
+ query.sameResult(p.query) && nullAware == p.nullAware &&
+ children.length == p.children.length &&
+ children.zip(p.children).forall(p => p._1.semanticEquals(p._2))
+ case _ => false
+ }
override def toString: String = s"predicate-subquery#${exprId.id} $conditionString"
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
index 8bce404735..24a2dc9d3b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala
@@ -538,9 +538,9 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
if (innerChildren.nonEmpty) {
innerChildren.init.foreach(_.generateTreeString(
- depth + 2, lastChildren :+ false :+ false, builder, verbose))
+ depth + 2, lastChildren :+ children.isEmpty :+ false, builder, verbose))
innerChildren.last.generateTreeString(
- depth + 2, lastChildren :+ false :+ true, builder, verbose)
+ depth + 2, lastChildren :+ children.isEmpty :+ true, builder, verbose)
}
if (children.nonEmpty) {