aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorBurak Yavuz <brkyvz@gmail.com>2015-05-01 23:43:24 -0700
committerReynold Xin <rxin@databricks.com>2015-05-01 23:43:24 -0700
commit2e0f3579f1fa7139c2e79bde656cbac049abbc33 (patch)
tree1c408c400a6514978eac32dd49366940f03b6beb /sql
parentb79aeb95b45ab4ae811039d452cf028d7b844132 (diff)
downloadspark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.tar.gz
spark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.tar.bz2
spark-2e0f3579f1fa7139c2e79bde656cbac049abbc33.zip
[SPARK-7242] added python api for freqItems in DataFrames
Adds the Python API for DataFrame's `freqItems` and addresses the review comments from the previous PR. cc rxin Author: Burak Yavuz <brkyvz@gmail.com> Closes #5859 from brkyvz/df-freq-py2 and squashes the following commits: f9aa9ce [Burak Yavuz] addressed comments v0.1 4b25056 [Burak Yavuz] added python api for freqItems
Diffstat (limited to 'sql')
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala9
1 files changed, 6 insertions, 3 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index 23652aeb7c..e8fa829477 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -43,7 +43,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
}
/**
- * Runs `freqItems` with a default `support` of 1%.
+ * Finds frequent items for columns, possibly with false positives, using the
+ * frequent element count algorithm described in
+ * [[http://dx.doi.org/10.1145/762471.762473 the paper by Karp, Schenker, and Papadimitriou]].
+ * Uses a default `support` of 1%.
*
* @param cols the names of the columns to search frequent items in.
* @return A Local DataFrame with the Array of frequent items for each column.
@@ -55,14 +58,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
/**
* Python friendly implementation for `freqItems`
*/
- def freqItems(cols: List[String], support: Double): DataFrame = {
+ def freqItems(cols: Seq[String], support: Double): DataFrame = {
FrequentItems.singlePassFreqItems(df, cols, support)
}
/**
* Python friendly implementation for `freqItems` with a default `support` of 1%.
*/
- def freqItems(cols: List[String]): DataFrame = {
+ def freqItems(cols: Seq[String]): DataFrame = {
FrequentItems.singlePassFreqItems(df, cols, 0.01)
}