diff options
author | Burak Yavuz <brkyvz@gmail.com> | 2015-05-04 17:02:49 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-05-04 17:02:49 -0700 |
commit | 80554111703c08e2bedbe303e04ecd162ec119e1 (patch) | |
tree | fc2ea97df7c1111f33020329d93a9338ecd5fecb /python/pyspark/sql | |
parent | fc8b58195afa67fbb75b4c8303e022f703cbf007 (diff) | |
download | spark-80554111703c08e2bedbe303e04ecd162ec119e1.tar.gz spark-80554111703c08e2bedbe303e04ecd162ec119e1.tar.bz2 spark-80554111703c08e2bedbe303e04ecd162ec119e1.zip |
[SPARK-7243][SQL] Contingency Tables for DataFrames
Computes a pair-wise frequency table of the given columns. Also known as cross-tabulation.
cc mengxr rxin
Author: Burak Yavuz <brkyvz@gmail.com>
Closes #5842 from brkyvz/df-cont and squashes the following commits:
a07c01e [Burak Yavuz] addressed comments v4.1
ae9e01d [Burak Yavuz] fix test
9106585 [Burak Yavuz] addressed comments v4.0
bced829 [Burak Yavuz] fix merge conflicts
a63ad00 [Burak Yavuz] addressed comments v3.0
a0cad97 [Burak Yavuz] addressed comments v3.0
6805df8 [Burak Yavuz] addressed comments and fixed test
939b7c4 [Burak Yavuz] lint python
7f098bc [Burak Yavuz] add crosstab pyTest
fd53b00 [Burak Yavuz] added python support for crosstab
27a5a81 [Burak Yavuz] implemented crosstab
Diffstat (limited to 'python/pyspark/sql')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 25 | ||||
-rw-r--r-- | python/pyspark/sql/tests.py | 9 |
2 files changed, 34 insertions, 0 deletions
def crosstab(self, col1, col2):
    """
    Computes a pair-wise frequency table of the given columns. Also known as a contingency
    table. The number of distinct values for each column should be less than 1e4. The first
    column of each row will be the distinct values of `col1` and the column names will be the
    distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that
    have no occurrences will have `null` as their counts.
    :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.

    :param col1: The name of the first column. Distinct items will make the first item of
        each row.
    :param col2: The name of the second column. Distinct items will make the column names
        of the DataFrame.
    """
    # Validate both column names up front; col1 is checked first so the
    # error reported for two bad arguments matches the original behavior.
    for arg_name, arg_value in (("col1", col1), ("col2", col2)):
        if not isinstance(arg_value, str):
            raise ValueError("%s should be a string." % arg_name)
    # Delegate to the JVM-side stat helper and wrap the result back into a
    # Python DataFrame bound to the same SQLContext.
    jdf = self._jdf.stat().crosstab(col1, col2)
    return DataFrame(jdf, self.sql_ctx)
# --- python/pyspark/sql/dataframe.py : class DataFrameStatFunctions ---

def crosstab(self, col1, col2):
    # Thin alias: delegate to DataFrame.crosstab on the wrapped DataFrame.
    return self.df.crosstab(col1, col2)

# Share the docstring with the canonical implementation so both entry
# points document the identical contract.
crosstab.__doc__ = DataFrame.crosstab.__doc__

# --- python/pyspark/sql/tests.py : class SQLTests ---

def test_crosstab(self):
    """Verify crosstab produces a 3x2 contingency table with count 1 per cell.

    For i in 1..6, a = i % 3 and b = i % 2: every (a, b) pair occurs
    exactly once, so each of the three rows must contain counts of 1.
    """
    df = self.sc.parallelize([Row(a=i % 3, b=i % 2) for i in range(1, 7)]).toDF()
    ct = df.stat.crosstab("a", "b").collect()
    # Rows come back unordered; sort by the first column ("0", "1", "2").
    ct = sorted(ct, key=lambda x: x[0])
    for i, row in enumerate(ct):
        self.assertEqual(row[0], str(i))
        # BUG FIX: the original used assertTrue(row[1], 1), which treats 1 as
        # the failure *message* and only checks truthiness — it would pass for
        # any non-zero count. assertEqual actually verifies the cell value.
        self.assertEqual(row[1], 1)
        self.assertEqual(row[2], 1)