From 2e0f3579f1fa7139c2e79bde656cbac049abbc33 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Fri, 1 May 2015 23:43:24 -0700 Subject: [SPARK-7242] added python api for freqItems in DataFrames The python api for DataFrame's plus addressed your comments from previous PR. rxin Author: Burak Yavuz Closes #5859 from brkyvz/df-freq-py2 and squashes the following commits: f9aa9ce [Burak Yavuz] addressed comments v0.1 4b25056 [Burak Yavuz] added python api for freqItems --- .../main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'sql') diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index 23652aeb7c..e8fa829477 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -43,7 +43,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { } /** - * Runs `freqItems` with a default `support` of 1%. + * Finds frequent items for columns, possibly with false positives, using the + * frequent element count algorithm described in + * [[http://dx.doi.org/10.1145/762471.762473 here]], proposed by Karp, Schenker, and Papadimitriou. + * Uses a default `support` of 1%. * * @param cols the names of the columns to search frequent items in. * @return A Local DataFrame with the Array of frequent items for each column. @@ -55,14 +58,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { /** * Python friendly implementation for `freqItems` */ - def freqItems(cols: List[String], support: Double): DataFrame = { + def freqItems(cols: Seq[String], support: Double): DataFrame = { FrequentItems.singlePassFreqItems(df, cols, support) } /** * Python friendly implementation for `freqItems` with a default `support` of 1%. 
*/ - def freqItems(cols: List[String]): DataFrame = { + def freqItems(cols: Seq[String]): DataFrame = { FrequentItems.singlePassFreqItems(df, cols, 0.01) } -- cgit v1.2.3