aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/sql/dataframe.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r--python/pyspark/sql/dataframe.py25
1 files changed, 25 insertions, 0 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 22762c5bbb..f30a92dfc8 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -931,6 +931,26 @@ class DataFrame(object):
raise ValueError("col2 should be a string.")
return self._jdf.stat().cov(col1, col2)
def crosstab(self, col1, col2):
    """
    Computes a pair-wise frequency table of the given columns. Also known as a contingency
    table. The number of distinct values for each column should be less than 1e4. The first
    column of each row will be the distinct values of `col1` and the column names will be the
    distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that
    have no occurrences will have `null` as their counts.
    :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.

    :param col1: The name of the first column. Distinct items will make the first item of
        each row.
    :param col2: The name of the second column. Distinct items will make the column names
        of the DataFrame.
    :return: A new :class:`DataFrame` wrapping the result of the JVM-side
        ``stat().crosstab(col1, col2)`` call, bound to this DataFrame's ``sql_ctx``.
    :raises ValueError: If `col1` or `col2` is not a string.
    """
    # On Python 2 a column name may be either `str` or `unicode`; both derive from
    # `basestring`. Checking against plain `str` would wrongly reject unicode names
    # (e.g. u"age"). On Python 3 `basestring` does not exist, so fall back to `str`.
    try:
        string_types = basestring  # noqa: F821 - only defined on Python 2
    except NameError:
        string_types = str
    if not isinstance(col1, string_types):
        raise ValueError("col1 should be a string.")
    if not isinstance(col2, string_types):
        raise ValueError("col2 should be a string.")
    # The computation is delegated to the JVM DataFrame; the returned Java object is
    # wrapped back into a Python DataFrame that shares this frame's SQL context.
    return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx)

+
def freqItems(self, cols, support=None):
"""
Finding frequent items for columns, possibly with false positives. Using the
@@ -1423,6 +1443,11 @@ class DataFrameStatFunctions(object):
cov.__doc__ = DataFrame.cov.__doc__
def crosstab(self, col1, col2):
    # Thin alias: forward the call unchanged to the DataFrame this object wraps.
    wrapped = self.df
    return wrapped.crosstab(col1, col2)

# Share DataFrame.crosstab's docstring so both entry points document one contract.
crosstab.__doc__ = DataFrame.crosstab.__doc__
+
def freqItems(self, cols, support=None):
return self.df.freqItems(cols, support)