SPARK-2226: [SQL] transform HAVING clauses with aggregate expressions that aren't in the aggregation list

This change adds an analyzer rule to 1. find expressions in `HAVING` clause filters that depend on unresolved attributes, 2. push these expressions down to the underlying aggregates, and then 3. project them away above the filter. It also enables the `HAVING` queries in the Hive compatibility suite. Author: William Benton <willb@redhat.com> Closes #1497 from willb/spark-2226 and squashes the following commits: 92c9a93 [William Benton] Removed unnecessary import f1d4f34 [William Benton] Cleanups missed in prior commit 0e1624f [William Benton] Incorporated suggestions from @marmbrus; thanks! 541d4ee [William Benton] Cleanups from review 5a12647 [William Benton] Explanatory comments and stylistic cleanups. c7f2b2c [William Benton] Whitelist HAVING queries. 29a26e3 [William Benton] Added rule to handle unresolved attributes in HAVING clauses (SPARK-2226)
author: William Benton <willb@redhat.com> 2014-07-23 16:25:32 -0700
committer: Michael Armbrust <michael@databricks.com> 2014-07-23 16:25:32 -0700
commit: e060d3ee2d910a5a802bb29630dca6f66cc0525d (patch)
tree: 766d3ece33ff5955d8b0ffafb824beee5b7f459e /sql/catalyst
parent: 91903e0a50b0efb7217610021a628b3043004d82 (diff)
download: spark-e060d3ee2d910a5a802bb29630dca6f66cc0525d.tar.gz
spark-e060d3ee2d910a5a802bb29630dca6f66cc0525d.tar.bz2
spark-e060d3ee2d910a5a802bb29630dca6f66cc0525d.zip
1 files changed, 26 insertions, 1 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index c7188469bf..02bdb64f30 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 
-
 /**
  * A trivial [[Analyzer]] with an [[EmptyCatalog]] and [[EmptyFunctionRegistry]]. Used for testing
  * when all relations are already filled in and the analyser needs only to resolve attribute
@@ -54,6 +53,7 @@ class Analyzer(catalog: Catalog, registry: FunctionRegistry, caseSensitive: Bool
       StarExpansion ::
       ResolveFunctions ::
       GlobalAggregates ::
+      UnresolvedHavingClauseAttributes :: 
       typeCoercionRules :_*),
     Batch("Check Analysis", Once,
       CheckResolution),
@@ -152,6 +152,31 @@ class Analyzer(catalog: Catalog, registry: FunctionRegistry, caseSensitive: Bool
   }
 
   /**
+   * This rule finds expressions in HAVING clause filters that depend on
+   * unresolved attributes.  It pushes these expressions down to the underlying
+   * aggregates and then projects them away above the filter.
+   */
+  object UnresolvedHavingClauseAttributes extends Rule[LogicalPlan] {
+    def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
+      case filter @ Filter(havingCondition, aggregate @ Aggregate(_, originalAggExprs, _)) 
+          if !filter.resolved && aggregate.resolved && containsAggregate(havingCondition) => {
+        val evaluatedCondition = Alias(havingCondition,  "havingCondition")()
+        val aggExprsWithHaving = evaluatedCondition +: originalAggExprs
+        
+        Project(aggregate.output,
+          Filter(evaluatedCondition.toAttribute,
+            aggregate.copy(aggregateExpressions = aggExprsWithHaving)))
+      }
+      
+    }
+    
+    protected def containsAggregate(condition: Expression): Boolean =
+      condition
+        .collect { case ae: AggregateExpression => ae }
+        .nonEmpty
+  }
+
+  /**
    * When a SELECT clause has only a single expression and that expression is a
    * [[catalyst.expressions.Generator Generator]] we convert the
    * [[catalyst.plans.logical.Project Project]] to a [[catalyst.plans.logical.Generate Generate]].
author	William Benton <willb@redhat.com>	2014-07-23 16:25:32 -0700
committer	Michael Armbrust <michael@databricks.com>	2014-07-23 16:25:32 -0700
commit	e060d3ee2d910a5a802bb29630dca6f66cc0525d (patch)
tree	766d3ece33ff5955d8b0ffafb824beee5b7f459e /sql/catalyst
parent	91903e0a50b0efb7217610021a628b3043004d82 (diff)
download	spark-e060d3ee2d910a5a802bb29630dca6f66cc0525d.tar.gz spark-e060d3ee2d910a5a802bb29630dca6f66cc0525d.tar.bz2 spark-e060d3ee2d910a5a802bb29630dca6f66cc0525d.zip