author    Davies Liu <davies@databricks.com>  2016-02-10 20:13:38 -0800
committer Davies Liu <davies.liu@gmail.com>   2016-02-10 20:13:38 -0800
commit    b5761d150b66ee0ae5f1be897d9d7a1abb039884 (patch)
tree      4d2f839c621b844f09d7e5045c23156cec3a12a6 /python/pyspark/sql/dataframe.py
parent    0f09f0226983cdc409ef504dff48395787dc844f (diff)
[SPARK-12706] [SQL] grouping() and grouping_id()
grouping() indicates whether a given column is aggregated away in the current grouping set, and grouping_id() returns the level of grouping. grouping()/grouping_id() can be used with window functions, but do not yet work in HAVING/ORDER BY clauses; that will be fixed by another PR. Hive's GROUPING__ID/grouping_id() is wrong according to Hive's own docs, and we implemented the same wrong behavior; this PR changes it to match the behavior of most databases (and of the Hive docs).

Author: Davies Liu <davies@databricks.com>

Closes #10677 from davies/grouping.
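Since the commit message is terse, here is a minimal sketch of the behavior the two new functions expose from Python. It assumes the later SparkSession entry point and the pyspark.sql.functions wrappers grouping()/grouping_id() (which land around Spark 2.0 and are not necessarily in the tree at this commit), plus the same two-row dataset the doctests below use; `spark` and `df` are illustrative names, not part of this diff.

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    # Illustrative session and the two-row dataset from the doctests below.
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ["name", "age"])

    # grouping(col) is 1 when `col` is aggregated away at that grouping level
    # (its value shows up as null) and 0 otherwise; grouping_id() packs those
    # bits into an integer, first grouping column in the most significant bit,
    # which is the standard-database behavior this commit adopts.
    (df.cube("name", "age")
       .agg(F.grouping("name"), F.grouping_id(), F.count("*"))
       .orderBy("name", "age")
       .show())

Under that bit encoding, the grand-total row (both columns null) gets grouping_id() == 3, the per-age subtotals (name null) get 2, the per-name subtotals (age null) get 1, and the base rows get 0.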
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r--  python/pyspark/sql/dataframe.py  22
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 3a8c8305ee..3104e41407 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -887,8 +887,8 @@ class DataFrame(object):
[Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
>>> sorted(df.groupBy(df.name).avg().collect())
[Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]
- >>> df.groupBy(['name', df.age]).count().collect()
- [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
+ >>> sorted(df.groupBy(['name', df.age]).count().collect())
+ [Row(name=u'Alice', age=2, count=1), Row(name=u'Bob', age=5, count=1)]
"""
jgd = self._jdf.groupBy(self._jcols(*cols))
from pyspark.sql.group import GroupedData
@@ -900,15 +900,15 @@ class DataFrame(object):
Create a multi-dimensional rollup for the current :class:`DataFrame` using
the specified columns, so we can run aggregation on them.
- >>> df.rollup('name', df.age).count().show()
+ >>> df.rollup("name", df.age).count().orderBy("name", "age").show()
+-----+----+-----+
| name| age|count|
+-----+----+-----+
- |Alice| 2| 1|
- | Bob| 5| 1|
- | Bob|null| 1|
| null|null| 2|
|Alice|null| 1|
+ |Alice| 2| 1|
+ | Bob|null| 1|
+ | Bob| 5| 1|
+-----+----+-----+
"""
jgd = self._jdf.rollup(self._jcols(*cols))
@@ -921,17 +921,17 @@ class DataFrame(object):
Create a multi-dimensional cube for the current :class:`DataFrame` using
the specified columns, so we can run aggregation on them.
- >>> df.cube('name', df.age).count().show()
+ >>> df.cube("name", df.age).count().orderBy("name", "age").show()
+-----+----+-----+
| name| age|count|
+-----+----+-----+
+ | null|null| 2|
| null| 2| 1|
- |Alice| 2| 1|
- | Bob| 5| 1|
| null| 5| 1|
- | Bob|null| 1|
- | null|null| 2|
|Alice|null| 1|
+ |Alice| 2| 1|
+ | Bob|null| 1|
+ | Bob| 5| 1|
+-----+----+-----+
"""
jgd = self._jdf.cube(self._jcols(*cols))
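For reference, a sketch contrasting the two updated doctests, reusing the assumed `df` from the earlier sketch: rollup("name", "age") emits only the hierarchical grouping levels (name, age), (name) and (), while cube adds the age-only level, which is where the extra | null|   2| and | null|   5| rows in the cube output come from.

    # Assuming the illustrative `df` defined in the earlier sketch.
    # rollup: hierarchical subtotals only -> 5 rows for this data:
    # the (name, age) pairs, per-name subtotals, and the grand total.
    df.rollup("name", "age").count().orderBy("name", "age").show()

    # cube: every combination of the grouping columns -> 7 rows, adding
    # the per-age subtotals that rollup omits.
    df.cube("name", "age").count().orderBy("name", "age").show()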