aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjiangxingbo <jiangxb1987@gmail.com>2016-11-01 11:25:11 -0700
committerReynold Xin <rxin@databricks.com>2016-11-01 11:25:11 -0700
commitd0272b436512b71f04313e109d3d21a6e9deefca (patch)
tree6e6e64f41ded2f8f6e3636626f185a9ba726a80d
parent8a538c97b556f80f67c80519af0ce879557050d5 (diff)
downloadspark-d0272b436512b71f04313e109d3d21a6e9deefca.tar.gz
spark-d0272b436512b71f04313e109d3d21a6e9deefca.tar.bz2
spark-d0272b436512b71f04313e109d3d21a6e9deefca.zip
[SPARK-18148][SQL] Misleading Error Message for Aggregation Without Window/GroupBy
## What changes were proposed in this pull request? Aggregation Without Window/GroupBy expressions will fail in `checkAnalysis`, the error message is a bit misleading, we should generate a more specific error message for this case. For example, ``` spark.read.load("/some-data") .withColumn("date_dt", to_date($"date")) .withColumn("year", year($"date_dt")) .withColumn("week", weekofyear($"date_dt")) .withColumn("user_count", count($"userId")) .withColumn("daily_max_in_week", max($"user_count").over(weeklyWindow)) ) ``` creates the following output: ``` org.apache.spark.sql.AnalysisException: expression '`randomColumn`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; ``` In the error message above, `randomColumn` doesn't appear in the query(acturally it's added by function `withColumn`), so the message is not enough for the user to address the problem. ## How was this patch tested? Manually test Before: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: expression 'tbl.`col`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;; ``` After: ``` scala> spark.sql("select col, count(col) from tbl") org.apache.spark.sql.AnalysisException: grouping expressions sequence is empty, and 'tbl.`col`' is not an aggregate function. Wrap '(count(col#231L) AS count(col)#239L)' in windowing function(s) or wrap 'tbl.`col`' in first() (or first_value) if you don't care which value you get.;; ``` Also add new test sqls in `group-by.sql`. Author: jiangxingbo <jiangxb1987@gmail.com> Closes #15672 from jiangxb1987/groupBy-empty.
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala12
-rw-r--r--sql/core/src/test/resources/sql-tests/inputs/group-by.sql41
-rw-r--r--sql/core/src/test/resources/sql-tests/results/group-by.sql.out116
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala35
4 files changed, 140 insertions, 64 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 9a7c2a944b..3455a567b7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -214,6 +214,18 @@ trait CheckAnalysis extends PredicateHelper {
s"appear in the arguments of an aggregate function.")
}
}
+ case e: Attribute if groupingExprs.isEmpty =>
+ // Collect all [[AggregateExpressions]]s.
+ val aggExprs = aggregateExprs.filter(_.collect {
+ case a: AggregateExpression => a
+ }.nonEmpty)
+ failAnalysis(
+ s"grouping expressions sequence is empty, " +
+ s"and '${e.sql}' is not an aggregate function. " +
+ s"Wrap '${aggExprs.map(_.sql).mkString("(", ", ", ")")}' in windowing " +
+ s"function(s) or wrap '${e.sql}' in first() (or first_value) " +
+ s"if you don't care which value you get."
+ )
case e: Attribute if !groupingExprs.exists(_.semanticEquals(e)) =>
failAnalysis(
s"expression '${e.sql}' is neither present in the group by, " +
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index 6741703d9d..d950ec83d9 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -1,17 +1,34 @@
--- Temporary data.
-create temporary view myview as values 128, 256 as v(int_col);
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b);
--- group by should produce all input rows,
-select int_col, count(*) from myview group by int_col;
+-- Aggregate with empty GroupBy expressions.
+SELECT a, COUNT(b) FROM testData;
+SELECT COUNT(a), COUNT(b) FROM testData;
--- group by should produce a single row.
-select 'foo', count(*) from myview group by 1;
+-- Aggregate with non-empty GroupBy expressions.
+SELECT a, COUNT(b) FROM testData GROUP BY a;
+SELECT a, COUNT(b) FROM testData GROUP BY b;
+SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a;
--- group-by should not produce any rows (whole stage code generation).
-select 'foo' from myview where int_col == 0 group by 1;
+-- Aggregate grouped by literals.
+SELECT 'foo', COUNT(a) FROM testData GROUP BY 1;
--- group-by should not produce any rows (hash aggregate).
-select 'foo', approx_count_distinct(int_col) from myview where int_col == 0 group by 1;
+-- Aggregate grouped by literals (whole stage code generation).
+SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1;
--- group-by should not produce any rows (sort aggregate).
-select 'foo', max(struct(int_col)) from myview where int_col == 0 group by 1;
+-- Aggregate grouped by literals (hash aggregate).
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1;
+
+-- Aggregate grouped by literals (sort aggregate).
+SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1;
+
+-- Aggregate with complex GroupBy expressions.
+SELECT a + b, COUNT(b) FROM testData GROUP BY a + b;
+SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1;
+SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1;
+
+-- Aggregate with nulls.
+SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
+FROM testData;
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index 9127bd4dd4..a91f04e098 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,9 +1,11 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 6
+-- Number of queries: 14
-- !query 0
-create temporary view myview as values 128, 256 as v(int_col)
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b)
-- !query 0 schema
struct<>
-- !query 0 output
@@ -11,41 +13,121 @@ struct<>
-- !query 1
-select int_col, count(*) from myview group by int_col
+SELECT a, COUNT(b) FROM testData
-- !query 1 schema
-struct<int_col:int,count(1):bigint>
+struct<>
-- !query 1 output
-128 1
-256 1
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.;
-- !query 2
-select 'foo', count(*) from myview group by 1
+SELECT COUNT(a), COUNT(b) FROM testData
-- !query 2 schema
-struct<foo:string,count(1):bigint>
+struct<count(a):bigint,count(b):bigint>
-- !query 2 output
-foo 2
+7 7
-- !query 3
-select 'foo' from myview where int_col == 0 group by 1
+SELECT a, COUNT(b) FROM testData GROUP BY a
-- !query 3 schema
-struct<foo:string>
+struct<a:int,count(b):bigint>
-- !query 3 output
-
+1 2
+2 2
+3 2
+NULL 1
-- !query 4
-select 'foo', approx_count_distinct(int_col) from myview where int_col == 0 group by 1
+SELECT a, COUNT(b) FROM testData GROUP BY b
-- !query 4 schema
-struct<foo:string,approx_count_distinct(int_col):bigint>
+struct<>
-- !query 4 output
-
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
-- !query 5
-select 'foo', max(struct(int_col)) from myview where int_col == 0 group by 1
+SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a
-- !query 5 schema
-struct<foo:string,max(struct(int_col)):struct<int_col:int>>
+struct<count(a):bigint,count(b):bigint>
-- !query 5 output
+0 1
+2 2
+2 2
+3 2
+
+
+-- !query 6
+SELECT 'foo', COUNT(a) FROM testData GROUP BY 1
+-- !query 6 schema
+struct<foo:string,count(a):bigint>
+-- !query 6 output
+foo 7
+
+
+-- !query 7
+SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1
+-- !query 7 schema
+struct<foo:string>
+-- !query 7 output
+
+
+-- !query 8
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1
+-- !query 8 schema
+struct<foo:string,approx_count_distinct(a):bigint>
+-- !query 8 output
+
+
+
+-- !query 9
+SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1
+-- !query 9 schema
+struct<foo:string,max(struct(a)):struct<a:int>>
+-- !query 9 output
+
+
+
+-- !query 10
+SELECT a + b, COUNT(b) FROM testData GROUP BY a + b
+-- !query 10 schema
+struct<(a + b):int,count(b):bigint>
+-- !query 10 output
+2 1
+3 2
+4 2
+5 1
+NULL 1
+
+
+-- !query 11
+SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1
+-- !query 11 schema
+struct<>
+-- !query 11 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 12
+SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1
+-- !query 12 schema
+struct<((a + 1) + 1):int,count(b):bigint>
+-- !query 12 output
+3 2
+4 2
+5 2
+NULL 1
+
+
+-- !query 13
+SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
+FROM testData
+-- !query 13 schema
+struct<skewness(CAST(a AS DOUBLE)):double,kurtosis(CAST(a AS DOUBLE)):double,min(a):int,max(a):int,avg(a):double,var_samp(CAST(a AS DOUBLE)):double,stddev_samp(CAST(a AS DOUBLE)):double,sum(a):bigint,count(a):bigint>
+-- !query 13 output
+-0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 1a43d0b220..9a3d93cf17 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -463,20 +463,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
)
}
- test("agg") {
- checkAnswer(
- sql("SELECT a, SUM(b) FROM testData2 GROUP BY a"),
- Seq(Row(1, 3), Row(2, 3), Row(3, 3)))
- }
-
- test("aggregates with nulls") {
- checkAnswer(
- sql("SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a)," +
- "AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) FROM nullInts"),
- Row(0, -1.5, 1, 3, 2, 1.0, 1, 6, 3)
- )
- }
-
test("select *") {
checkAnswer(
sql("SELECT * FROM testData"),
@@ -1178,27 +1164,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
Row(1))
}
- test("throw errors for non-aggregate attributes with aggregation") {
- def checkAggregation(query: String, isInvalidQuery: Boolean = true) {
- if (isInvalidQuery) {
- val e = intercept[AnalysisException](sql(query).queryExecution.analyzed)
- assert(e.getMessage contains "group by")
- } else {
- // Should not throw
- sql(query).queryExecution.analyzed
- }
- }
-
- checkAggregation("SELECT key, COUNT(*) FROM testData")
- checkAggregation("SELECT COUNT(key), COUNT(*) FROM testData", isInvalidQuery = false)
-
- checkAggregation("SELECT value, COUNT(*) FROM testData GROUP BY key")
- checkAggregation("SELECT COUNT(value), SUM(key) FROM testData GROUP BY key", false)
-
- checkAggregation("SELECT key + 2, COUNT(*) FROM testData GROUP BY key + 1")
- checkAggregation("SELECT key + 1 + 1, COUNT(*) FROM testData GROUP BY key + 1", false)
- }
-
testQuietly(
"SPARK-16748: SparkExceptions during planning should not wrapped in TreeNodeException") {
intercept[SparkException] {