author     Davies Liu <davies@databricks.com>    2015-05-21 17:43:08 -0700
committer  Reynold Xin <rxin@databricks.com>     2015-05-21 17:43:08 -0700
commit     17791a58159b3e4619d0367f54a4c5332342658b (patch)
tree       4bc78cc43c47d05e1916975b65b551bdcfc8e42d /python
parent     d68ea24d60ce1aa55b06a8c107f42544d696eb41 (diff)
[SPARK-7783] [SQL] [PySpark] add DataFrame.rollup/cube in Python
Author: Davies Liu <davies@databricks.com>

Closes #6311 from davies/rollup and squashes the following commits:

0261db1 [Davies Liu] use @since
a51ca6b [Davies Liu] Merge branch 'master' of github.com:apache/spark into rollup
8ad5af4 [Davies Liu] Update dataframe.py
ade3841 [Davies Liu] add DataFrame.rollup/cube in Python
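The new methods mirror SQL's ROLLUP and CUBE operators: rollup('name', age) aggregates over the hierarchical grouping sets (name, age), (name,) and (), while cube('name', age) additionally covers (age,). A minimal usage sketch (not part of the commit; assumes a 1.4-era PySpark shell where sqlContext is already defined, and uses the same two-row df as the doctests in the diff below):

    # Build the two-row DataFrame used in the doctests below.
    df = sqlContext.createDataFrame([('Alice', 2), ('Bob', 5)], ['name', 'age'])

    # rollup: hierarchical grouping sets (name, age), (name,), ()
    df.rollup('name', df.age).count().show()

    # cube: all 2^n grouping sets, i.e. also (age,)
    df.cube('name', df.age).count().show()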
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/dataframe.py  48
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 3fc7d0048e..132db90e69 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -801,9 +801,53 @@ class DataFrame(object):
         >>> df.groupBy(['name', df.age]).count().collect()
         [Row(name=u'Bob', age=5, count=1), Row(name=u'Alice', age=2, count=1)]
         """
-        jdf = self._jdf.groupBy(self._jcols(*cols))
+        jgd = self._jdf.groupBy(self._jcols(*cols))
         from pyspark.sql.group import GroupedData
-        return GroupedData(jdf, self.sql_ctx)
+        return GroupedData(jgd, self.sql_ctx)
+
+    @since(1.4)
+    def rollup(self, *cols):
+        """
+        Create a multi-dimensional rollup for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.rollup('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+        """
+        jgd = self._jdf.rollup(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jgd, self.sql_ctx)
+
+    @since(1.4)
+    def cube(self, *cols):
+        """
+        Create a multi-dimensional cube for the current :class:`DataFrame` using
+        the specified columns, so we can run aggregation on them.
+
+        >>> df.cube('name', df.age).count().show()
+        +-----+----+-----+
+        | name| age|count|
+        +-----+----+-----+
+        | null|   2|    1|
+        |Alice|null|    1|
+        |  Bob|   5|    1|
+        |  Bob|null|    1|
+        | null|   5|    1|
+        | null|null|    2|
+        |Alice|   2|    1|
+        +-----+----+-----+
+        """
+        jgd = self._jdf.cube(self._jcols(*cols))
+        from pyspark.sql.group import GroupedData
+        return GroupedData(jgd, self.sql_ctx)
 
     @since(1.3)
     def agg(self, *exprs):
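For intuition, rollup('name', 'age').count() yields the same rows as unioning one groupBy pass per grouping set in the (name, age) hierarchy. A rough equivalence sketch, not from the commit itself: it reuses the df from the sketch above, and the lit(None) casts are an assumption made to keep unionAll's column types aligned:

    from pyspark.sql.functions import lit

    # grouping set (name, age): the full groupBy
    full = df.groupBy('name', 'age').count()

    # grouping set (name,): age is grouped out, so emit it as null
    by_name = (df.groupBy('name').count()
                 .select('name', lit(None).cast('bigint').alias('age'), 'count'))

    # grouping set (): the grand total, with both key columns null
    total = (df.groupBy().count()
               .select(lit(None).cast('string').alias('name'),
                       lit(None).cast('bigint').alias('age'), 'count'))

    # cube would additionally union in the (age,) grouping set
    full.unionAll(by_name).unionAll(total).show()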