aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/sql/dataframe.py
diff options
context:
space:
mode:
authorBurak Yavuz <brkyvz@gmail.com>2015-05-01 23:43:24 -0700
committerReynold Xin <rxin@databricks.com>2015-05-01 23:43:24 -0700
commit2e0f3579f1fa7139c2e79bde656cbac049abbc33 (patch)
tree1c408c400a6514978eac32dd49366940f03b6beb /python/pyspark/sql/dataframe.py
parentb79aeb95b45ab4ae811039d452cf028d7b844132 (diff)
downloadspark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.tar.gz
spark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.tar.bz2
spark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.zip
[SPARK-7242] added python api for freqItems in DataFrames
Adds the Python API for DataFrame's freqItems and addresses your comments from the previous PR. rxin Author: Burak Yavuz <brkyvz@gmail.com> Closes #5859 from brkyvz/df-freq-py2 and squashes the following commits: f9aa9ce [Burak Yavuz] addressed comments v0.1 4b25056 [Burak Yavuz] added python api for freqItems
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r--python/pyspark/sql/dataframe.py25
1 files changed, 25 insertions, 0 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 5ff49cac55..e9fd17ed4c 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -889,6 +889,26 @@ class DataFrame(object):
raise ValueError("col2 should be a string.")
return self._jdf.stat().cov(col1, col2)
+    def freqItems(self, cols, support=None):
+        """
+        Finds frequent items for columns, possibly with false positives, and returns them as a
+        new :class:`DataFrame`. Uses the frequent element count algorithm proposed by Karp,
+        Schenker, and Papadimitriou, described in "http://dx.doi.org/10.1145/762471.762473".
+        :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.
+
+        :param cols: Names of the columns to calculate frequent items for as a list or tuple of
+            strings. Any other type raises :class:`ValueError`.
+        :param support: The frequency with which to consider an item 'frequent'. Default is 1%.
+            The support must be greater than 1e-4.
+        """
+        if isinstance(cols, tuple):
+            cols = list(cols)
+        if not isinstance(cols, list):
+            raise ValueError("cols must be a list or tuple of column names as strings.")
+        if support is None:  # only an omitted support gets the default; an explicit 0 is not masked
+            support = 0.01
+        return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)
+
@ignore_unicode_prefix
def withColumn(self, colName, col):
"""Returns a new :class:`DataFrame` by adding a column.
@@ -1344,6 +1364,11 @@ class DataFrameStatFunctions(object):
cov.__doc__ = DataFrame.cov.__doc__
+ def freqItems(self, cols, support=None):
+ return self.df.freqItems(cols, support)  # delegate to self.df with both arguments unchanged
+
+ freqItems.__doc__ = DataFrame.freqItems.__doc__  # reuse DataFrame.freqItems' docstring for this alias
+
def _test():
import doctest