[SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy

## What changes were proposed in this pull request? Aggregation Without Window/GroupBy expressions will fail in `checkAnalysis`, the error message is a bit misleading, we should generate a more specific error message for this case. For example, ``` spark.read.load("/some-data") .withColumn("date_dt", to_date($"date")) .withColumn("year", year($"date_dt")) .withColumn("week", weekofyear($"date_dt")) .withColumn("user_count", count($"userId")) .withColumn("daily_max_in_week", max($"user_count").over(weeklyWindow)) ) ``` creates the following output: ``` org.apache.spark.sql.AnalysisException: expression '`randomColumn`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; ``` In the error message above, `randomColumn` doesn't appear in the query(acturally it's added by function `withColumn`), so the message is not enough for the user to address the problem. ## How was this patch tested? Manually test Before: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: expression 'tbl.`col`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;; ``` After: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: grouping expressions sequence is empty, and 'tbl.`col`' is not an aggregate function. Wrap '(count(col#231L) AS count(col)#239L)' in windowing function(s) or wrap 'tbl.`col`' in first() (or first_value) if you don't care which value you get.;; ``` Also add new test sqls in `group-by.sql`. Author: jiangxingbo <jiangxb1987@gmail.com> Closes #15672 from jiangxb1987/groupBy-empty.
author: jiangxingbo <jiangxb1987@gmail.com> 2016-11-01 11:25:11 -0700
committer: Reynold Xin <rxin@databricks.com> 2016-11-01 11:25:11 -0700
commit: d0272b436512b71f04313e109d3d21a6e9deefca (patch)
tree: 6e6e64f41ded2f8f6e3636626f185a9ba726a80d
parent: 8a538c97b556f80f67c80519af0ce879557050d5 (diff)
download: spark-d0272b436512b71f04313e109d3d21a6e9deefca.tar.gz
spark-d0272b436512b71f04313e109d3d21a6e9deefca.tar.bz2
spark-d0272b436512b71f04313e109d3d21a6e9deefca.zip
4 files changed, 140 insertions, 64 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 9a7c2a944b..3455a567b7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -214,6 +214,18 @@ trait CheckAnalysis extends PredicateHelper {
                         s"appear in the arguments of an aggregate function.")
                   }
                 }
+              case e: Attribute if groupingExprs.isEmpty =>
+                // Collect all [[AggregateExpressions]]s.
+                val aggExprs = aggregateExprs.filter(_.collect {
+                  case a: AggregateExpression => a
+                }.nonEmpty)
+                failAnalysis(
+                  s"grouping expressions sequence is empty, " +
+                    s"and '${e.sql}' is not an aggregate function. " +
+                    s"Wrap '${aggExprs.map(_.sql).mkString("(", ", ", ")")}' in windowing " +
+                    s"function(s) or wrap '${e.sql}' in first() (or first_value) " +
+                    s"if you don't care which value you get."
+                )
               case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) =>
                 failAnalysis(
                   s"expression '${e.sql}' is neither present in the group by, " +
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index 6741703d9d..d950ec83d9 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -1,17 +1,34 @@
--- Temporary data.
-create temporary view myview as values 128, 256 as v(int_col);
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b);
 
--- group by should produce all input rows,
-select int_col, count(*) from myview group by int_col;
+-- Aggregate with empty GroupBy expressions.
+SELECT a, COUNT(b) FROM testData;
+SELECT COUNT(a), COUNT(b) FROM testData;
 
--- group by should produce a single row.
-select 'foo', count(*) from myview group by 1;
+-- Aggregate with non-empty GroupBy expressions.
+SELECT a, COUNT(b) FROM testData GROUP BY a;
+SELECT a, COUNT(b) FROM testData GROUP BY b;
+SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a;
 
--- group-by should not produce any rows (whole stage code generation).
-select 'foo' from myview where int_col == 0 group by 1;
+-- Aggregate grouped by literals.
+SELECT 'foo', COUNT(a) FROM testData GROUP BY 1;
 
--- group-by should not produce any rows (hash aggregate).
-select 'foo', approx_count_distinct(int_col) from myview where int_col == 0 group by 1;
+-- Aggregate grouped by literals (whole stage code generation).
+SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1;
 
--- group-by should not produce any rows (sort aggregate).
-select 'foo', max(struct(int_col)) from myview where int_col == 0 group by 1;
+-- Aggregate grouped by literals (hash aggregate).
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1;
+
+-- Aggregate grouped by literals (sort aggregate).
+SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1;
+
+-- Aggregate with complex GroupBy expressions.
+SELECT a + b, COUNT(b) FROM testData GROUP BY a + b;
+SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1;
+SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1;
+
+-- Aggregate with nulls.
+SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
+FROM testData;
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index 9127bd4dd4..a91f04e098 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,9 +1,11 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 6
+-- Number of queries: 14
 
 
 -- !query 0
-create temporary view myview as values 128, 256 as v(int_col)
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b)
 -- !query 0 schema
 struct<>
 -- !query 0 output
@@ -11,41 +13,121 @@ struct<>
 
 
 -- !query 1
-select int_col, count(*) from myview group by int_col
+SELECT a, COUNT(b) FROM testData
 -- !query 1 schema
-struct<int_col:int,count(1):bigint>
+struct<>
 -- !query 1 output
-128	1
-256	1
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.;
 
 
 -- !query 2
-select 'foo', count(*) from myview group by 1
+SELECT COUNT(a), COUNT(b) FROM testData
 -- !query 2 schema
-struct<foo:string,count(1):bigint>
+struct<count(a):bigint,count(b):bigint>
 -- !query 2 output
-foo	2
+7	7
 
 
 -- !query 3
-select 'foo' from myview where int_col == 0 group by 1
+SELECT a, COUNT(b) FROM testData GROUP BY a
 -- !query 3 schema
-struct<foo:string>
+struct<a:int,count(b):bigint>
 -- !query 3 output
-
+1	2
+2	2
+3	2
+NULL	1
 
 
 -- !query 4
-select 'foo', approx_count_distinct(int_col) from myview where int_col == 0 group by 1
+SELECT a, COUNT(b) FROM testData GROUP BY b
 -- !query 4 schema
-struct<foo:string,approx_count_distinct(int_col):bigint>
+struct<>
 -- !query 4 output
-
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
 
 
 -- !query 5
-select 'foo', max(struct(int_col)) from myview where int_col == 0 group by 1
+SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a
 -- !query 5 schema
-struct<foo:string,max(struct(int_col)):struct<int_col:int>>
+struct<count(a):bigint,count(b):bigint>
 -- !query 5 output
+0	1
+2	2
+2	2
+3	2
+
+
+-- !query 6
+SELECT 'foo', COUNT(a) FROM testData GROUP BY 1
+-- !query 6 schema
+struct<foo:string,count(a):bigint>
+-- !query 6 output
+foo	7
+
+
+-- !query 7
+SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1
+-- !query 7 schema
+struct<foo:string>
+-- !query 7 output
+
 
+
+-- !query 8
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1
+-- !query 8 schema
+struct<foo:string,approx_count_distinct(a):bigint>
+-- !query 8 output
+
+
+
+-- !query 9
+SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1
+-- !query 9 schema
+struct<foo:string,max(struct(a)):struct<a:int>>
+-- !query 9 output
+
+
+
+-- !query 10
+SELECT a + b, COUNT(b) FROM testData GROUP BY a + b
+-- !query 10 schema
+struct<(a + b):int,count(b):bigint>
+-- !query 10 output
+2	1
+3	2
+4	2
+5	1
+NULL	1
+
+
+-- !query 11
+SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1
+-- !query 11 schema
+struct<>
+-- !query 11 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 12
+SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1
+-- !query 12 schema
+struct<((a + 1) + 1):int,count(b):bigint>
+-- !query 12 output
+3	2
+4	2
+5	2
+NULL	1
+
+
+-- !query 13
+SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
+FROM testData
+-- !query 13 schema
+struct<skewness(CAST(a AS DOUBLE)):double,kurtosis(CAST(a AS DOUBLE)):double,min(a):int,max(a):int,avg(a):double,var_samp(CAST(a AS DOUBLE)):double,stddev_samp(CAST(a AS DOUBLE)):double,sum(a):bigint,count(a):bigint>
+-- !query 13 output
+-0.2723801058145729	-1.5069204152249134	1	3	2.142857142857143	0.8095238095238094	0.8997354108424372	15	7
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 1a43d0b220..9a3d93cf17 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -463,20 +463,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
     )
   }
 
-  test("agg") {
-    checkAnswer(
-      sql("SELECT a, SUM(b) FROM testData2 GROUP BY a"),
-      Seq(Row(1, 3), Row(2, 3), Row(3, 3)))
-  }
-
-  test("aggregates with nulls") {
-    checkAnswer(
-      sql("SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a)," +
-        "AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) FROM nullInts"),
-      Row(0, -1.5, 1, 3, 2, 1.0, 1, 6, 3)
-    )
-  }
-
   test("select *") {
     checkAnswer(
       sql("SELECT * FROM testData"),
@@ -1178,27 +1164,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       Row(1))
   }
 
-  test("throw errors for non-aggregate attributes with aggregation") {
-    def checkAggregation(query: String, isInvalidQuery: Boolean = true) {
-      if (isInvalidQuery) {
-        val e = intercept[AnalysisException](sql(query).queryExecution.analyzed)
-        assert(e.getMessage contains "group by")
-      } else {
-        // Should not throw
-        sql(query).queryExecution.analyzed
-      }
-    }
-
-    checkAggregation("SELECT key, COUNT(*) FROM testData")
-    checkAggregation("SELECT COUNT(key), COUNT(*) FROM testData", isInvalidQuery = false)
-
-    checkAggregation("SELECT value, COUNT(*) FROM testData GROUP BY key")
-    checkAggregation("SELECT COUNT(value), SUM(key) FROM testData GROUP BY key", false)
-
-    checkAggregation("SELECT key + 2, COUNT(*) FROM testData GROUP BY key + 1")
-    checkAggregation("SELECT key + 1 + 1, COUNT(*) FROM testData GROUP BY key + 1", false)
-  }
-
   testQuietly(
     "SPARK-16748: SparkExceptions during planning should not wrapped in TreeNodeException") {
     intercept[SparkException] {
author	jiangxingbo <jiangxb1987@gmail.com>	2016-11-01 11:25:11 -0700
committer	Reynold Xin <rxin@databricks.com>	2016-11-01 11:25:11 -0700
commit	d0272b436512b71f04313e109d3d21a6e9deefca (patch)
tree	6e6e64f41ded2f8f6e3636626f185a9ba726a80d
parent	8a538c97b556f80f67c80519af0ce879557050d5 (diff)
download	spark-d0272b436512b71f04313e109d3d21a6e9deefca.tar.gz spark-d0272b436512b71f04313e109d3d21a6e9deefca.tar.bz2 spark-d0272b436512b71f04313e109d3d21a6e9deefca.zip