author     Davies Liu <davies@databricks.com>    2015-05-21 17:43:08 -0700
committer  Reynold Xin <rxin@databricks.com>     2015-05-21 17:43:08 -0700
commit     17791a58159b3e4619d0367f54a4c5332342658b (patch)
tree       4bc78cc43c47d05e1916975b65b551bdcfc8e42d /python
parent     d68ea24d60ce1aa55b06a8c107f42544d696eb41 (diff)
[SPARK-7783] [SQL] [PySpark] add DataFrame.rollup/cube in Python
Author: Davies Liu <davies@databricks.com>

Closes #6311 from davies/rollup and squashes the following commits:

0261db1 [Davies Liu] use @since
a51ca6b [Davies Liu] Merge branch 'master' of github.com:apache/spark into rollup
8ad5af4 [Davies Liu] Update dataframe.py
ade3841 [Davies Liu] add DataFrame.rollup/cube in Python
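The new methods mirror SQL's ROLLUP and CUBE operators: rollup('name', age) aggregates over the hierarchical grouping sets (name, age), (name,) and (), while cube('name', age) additionally covers (age,). A minimal usage sketch (not part of the commit; assumes a 1.4-era PySpark shell where sqlContext is already defined, and uses the same two-row df as the doctests in the diff below):

    # Build the two-row DataFrame used in the doctests below.
    df = sqlContext.createDataFrame([('Alice', 2), ('Bob', 5)], ['name', 'age'])

    # rollup: hierarchical grouping sets (name, age), (name,), ()
    df.rollup('name', df.age).count().show()

    # cube: all 2^n grouping sets, i.e. also (age,)
    df.cube('name', df.age).count().show()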
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/dataframe.py  48
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 3fc7d0048e..132db90e69 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -801,9 +801,53 @@ class DataFrame(object):
         >>> df.groupBy(['name', df.age]).count().collect()
         [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
         """
-        jdf = self._jdf.groupBy(self._jcols(*cols))
+        jgd = self._jdf.groupBy(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
-        return GroupedData(jdf, self.sql_ctx)
+        return GroupedData(jgd, self.sql_ctx)
+
+    @since(1.4)
+    def rollup(self, *cols):
+        """
+        Create a multi-dimensional rollup for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.rollup('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+        """
+        jgd = self._jdf.rollup(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jgd, self.sql_ctx)
+
+    @since(1.4)
+    def cube(self, *cols):
+        """
+        Create a multi-dimensional cube for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.cube('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        | null|   2|    1|
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|   5|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+        """
+        jgd = self._jdf.cube(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jgd, self.sql_ctx)
 
     @since(1.3)
     def agg(self, *exprs):
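For intuition, rollup('name', 'age').count() yields the same rows as unioning one groupBy pass per grouping set in the (name, age) hierarchy. A rough equivalence sketch, not from the commit itself: it reuses the df from the sketch above, and the lit(None) casts are an assumption made to keep unionAll's column types aligned:

    from pyspark.sql.functions import lit

    # grouping set (name, age): the full groupBy
    full = df.groupBy('name', 'age').count()

    # grouping set (name,): age is grouped out, so emit it as null
    by_name = (df.groupBy('name').count()
                 .select('name', lit(None).cast('bigint').alias('age'), 'count'))

    # grouping set (): the grand total, with both key columns null
    total = (df.groupBy().count()
               .select(lit(None).cast('string').alias('name'),
                       lit(None).cast('bigint').alias('age'), 'count'))

    # cube would additionally union in the (age,) grouping set
    full.unionAll(by_name).unionAll(total).show()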