diff options
author | Burak Yavuz <brkyvz@gmail.com> | 2015-05-04 17:02:49 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-05-04 17:02:49 -0700 |
commit | 80554111703c08e2bedbe303e04ecd162ec119e1 (patch) | |
tree | fc2ea97df7c1111f33020329d93a9338ecd5fecb /python/pyspark/sql | |
parent | fc8b58195afa67fbb75b4c8303e022f703cbf007 (diff) | |
download | spark-80554111703c08e2bedbe303e04ecd162ec119e1.tar.gz spark-80554111703c08e2bedbe303e04ecd162ec119e1.tar.bz2 spark-80554111703c08e2bedbe303e04ecd162ec119e1.zip |
[SPARK-7243][SQL] Contingency Tables for DataFrames
Computes a pair-wise frequency table of the given columns. Also known as cross-tabulation.
cc mengxr rxin
Author: Burak Yavuz <brkyvz@gmail.com>
Closes #5842 from brkyvz/df-cont and squashes the following commits:
a07c01e [Burak Yavuz] addressed comments v4.1
ae9e01d [Burak Yavuz] fix test
9106585 [Burak Yavuz] addressed comments v4.0
bced829 [Burak Yavuz] fix merge conflicts
a63ad00 [Burak Yavuz] addressed comments v3.0
a0cad97 [Burak Yavuz] addressed comments v3.0
6805df8 [Burak Yavuz] addressed comments and fixed test
939b7c4 [Burak Yavuz] lint python
7f098bc [Burak Yavuz] add crosstab pyTest
fd53b00 [Burak Yavuz] added python support for crosstab
27a5a81 [Burak Yavuz] implemented crosstab
Diffstat (limited to 'python/pyspark/sql')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 25 | ||||
-rw-r--r-- | python/pyspark/sql/tests.py | 9 |
2 files changed, 34 insertions, 0 deletions
def crosstab(self, col1, col2):
    """
    Computes a pair-wise frequency table of the given columns. Also known as a contingency
    table. The number of distinct values for each column should be less than 1e4. The first
    column of each row will be the distinct values of `col1` and the column names will be the
    distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that
    have no occurrences will have `null` as their counts.
    :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.

    :param col1: The name of the first column. Distinct items will make the first item of
        each row.
    :param col2: The name of the second column. Distinct items will make the column names
        of the DataFrame.
    """
    # Validate both column names up front; col1 is checked first so the
    # error reported for two bad arguments matches the original behavior.
    for arg_name, arg_value in (("col1", col1), ("col2", col2)):
        if not isinstance(arg_value, str):
            raise ValueError("%s should be a string." % arg_name)
    # Delegate to the JVM-side stat helper and wrap the result back into a
    # Python DataFrame bound to the same SQLContext.
    jdf = self._jdf.stat().crosstab(col1, col2)
    return DataFrame(jdf, self.sql_ctx)
# --- python/pyspark/sql/dataframe.py : class DataFrameStatFunctions ---

def crosstab(self, col1, col2):
    # Thin alias: delegate to DataFrame.crosstab on the wrapped DataFrame.
    return self.df.crosstab(col1, col2)

# Share the docstring with the canonical implementation so both entry
# points document the identical contract.
crosstab.__doc__ = DataFrame.crosstab.__doc__

# --- python/pyspark/sql/tests.py : class SQLTests ---

def test_crosstab(self):
    """Verify crosstab produces a 3x2 contingency table with count 1 per cell.

    For i in 1..6, a = i % 3 and b = i % 2: every (a, b) pair occurs
    exactly once, so each of the three rows must contain counts of 1.
    """
    df = self.sc.parallelize([Row(a=i % 3, b=i % 2) for i in range(1, 7)]).toDF()
    ct = df.stat.crosstab("a", "b").collect()
    # Rows come back unordered; sort by the first column ("0", "1", "2").
    ct = sorted(ct, key=lambda x: x[0])
    for i, row in enumerate(ct):
        self.assertEqual(row[0], str(i))
        # BUG FIX: the original used assertTrue(row[1], 1), which treats 1 as
        # the failure *message* and only checks truthiness — it would pass for
        # any non-zero count. assertEqual actually verifies the cell value.
        self.assertEqual(row[1], 1)
        self.assertEqual(row[2], 1)