diff options
author | Burak Yavuz <brkyvz@gmail.com> | 2015-04-30 16:40:32 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-04-30 16:40:32 -0700 |
commit | 149b3ee2dac992355adbe44e989570726c1f35d0 (patch) | |
tree | 8daa8918fd98c4491a4fd9218005ab856297bcdf /sql/core/src/test/java | |
parent | 1c3e402e669d047410b00de9193adf3c329844a2 (diff) | |
download | spark-149b3ee2dac992355adbe44e989570726c1f35d0.tar.gz spark-149b3ee2dac992355adbe44e989570726c1f35d0.tar.bz2 spark-149b3ee2dac992355adbe44e989570726c1f35d0.zip |
[SPARK-7242][SQL][MLLIB] Frequent items for DataFrames
Finds frequent items, possibly with false positives, using the algorithm described in `http://www.cs.umd.edu/~samir/498/karp.pdf`.
public API under:
```
df.stat.freqItems(cols: Array[String], support: Double = 0.001): DataFrame
```
The output is a local DataFrame whose columns are the input column names with `-freqItems` appended to each. This is a single-pass algorithm that may return false positives, but no false negatives.
cc mengxr rxin
Let's get the implementations in; I can add the Python API in a follow-up PR.
Author: Burak Yavuz <brkyvz@gmail.com>
Closes #5799 from brkyvz/freq-items and squashes the following commits:
a6ec82c [Burak Yavuz] addressed comments v?
39b1bba [Burak Yavuz] removed toSeq
0915e23 [Burak Yavuz] addressed comments v2.1
3a5c177 [Burak Yavuz] addressed comments v2.0
482e741 [Burak Yavuz] removed old import
38e784d [Burak Yavuz] addressed comments v1.0
8279d4d [Burak Yavuz] added default value for support
3d82168 [Burak Yavuz] made base implementation
Diffstat (limited to 'sql/core/src/test/java')
-rw-r--r-- | sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java | 14 |
1 files changed, 9 insertions, 5 deletions
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index e5c9504d21..966d879e1f 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -22,10 +22,7 @@ import com.google.common.primitives.Ints; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.DataFrame; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SQLContext; -import org.apache.spark.sql.TestData$; +import org.apache.spark.sql.*; import org.apache.spark.sql.test.TestSQLContext; import org.apache.spark.sql.test.TestSQLContext$; import org.apache.spark.sql.types.*; @@ -178,5 +175,12 @@ public class JavaDataFrameSuite { Assert.assertEquals(bean.getD().get(i), d.apply(i)); } } - + + @Test + public void testFrequentItems() { + DataFrame df = context.table("testData2"); + String[] cols = new String[]{"a"}; + DataFrame results = df.stat().freqItems(cols, 0.2); + Assert.assertTrue(results.collect()[0].getSeq(0).contains(1)); + } } |