diff options
author | Burak Yavuz <brkyvz@gmail.com> | 2015-05-01 23:43:24 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-05-01 23:43:24 -0700 |
commit | 2e0f3579f1fa7139c2e79bde656cbac049abbc33 (patch) | |
tree | 1c408c400a6514978eac32dd49366940f03b6beb /python | |
parent | b79aeb95b45ab4ae811039d452cf028d7b844132 (diff) | |
download | spark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.tar.gz spark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.tar.bz2 spark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.zip |
[SPARK-7242] added python api for freqItems in DataFrames
The Python API for DataFrame's freqItems, plus it addresses your comments from the previous PR.
rxin
Author: Burak Yavuz <brkyvz@gmail.com>
Closes #5859 from brkyvz/df-freq-py2 and squashes the following commits:
f9aa9ce [Burak Yavuz] addressed comments v0.1
4b25056 [Burak Yavuz] added python api for freqItems
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 25 | ||||
-rw-r--r-- | python/pyspark/sql/tests.py | 7 |
2 files changed, 32 insertions, 0 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 5ff49cac55..e9fd17ed4c 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -889,6 +889,26 @@ class DataFrame(object): raise ValueError("col2 should be a string.") return self._jdf.stat().cov(col1, col2) + def freqItems(self, cols, support=None): + """ + Finding frequent items for columns, possibly with false positives. Using the + frequent element count algorithm described in + "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". + :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases. + + :param cols: Names of the columns to calculate frequent items for as a list or tuple of + strings. + :param support: The frequency with which to consider an item 'frequent'. Default is 1%. + The support must be greater than 1e-4. + """ + if isinstance(cols, tuple): + cols = list(cols) + if not isinstance(cols, list): + raise ValueError("cols must be a list or tuple of column names as strings.") + if not support: + support = 0.01 + return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx) + @ignore_unicode_prefix def withColumn(self, colName, col): """Returns a new :class:`DataFrame` by adding a column. 
@@ -1344,6 +1364,11 @@ class DataFrameStatFunctions(object): cov.__doc__ = DataFrame.cov.__doc__ + def freqItems(self, cols, support=None): + return self.df.freqItems(cols, support) + + freqItems.__doc__ = DataFrame.freqItems.__doc__ + def _test(): import doctest diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 44c8b6a1aa..613efc0ac0 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -375,6 +375,13 @@ class SQLTests(ReusedPySparkTestCase): self.assertEqual(self.testData, df.select(df.key, df.value).collect()) self.assertEqual([Row(value='1')], df.where(df.key == 1).select(df.value).collect()) + def test_freqItems(self): + vals = [Row(a=1, b=-2.0) if i % 2 == 0 else Row(a=i, b=i * 1.0) for i in range(100)] + df = self.sc.parallelize(vals).toDF() + items = df.stat.freqItems(("a", "b"), 0.4).collect()[0] + self.assertTrue(1 in items[0]) + self.assertTrue(-2.0 in items[1]) + def test_aggregator(self): df = self.df g = df.groupBy() |