aboutsummaryrefslogtreecommitdiff
path: root/sql/core/src/test/java
diff options
context:
space:
mode:
author Burak Yavuz <brkyvz@gmail.com> 2015-04-30 16:40:32 -0700
committer Reynold Xin <rxin@databricks.com> 2015-04-30 16:40:32 -0700
commit 149b3ee2dac992355adbe44e989570726c1f35d0 (patch)
tree 8daa8918fd98c4491a4fd9218005ab856297bcdf /sql/core/src/test/java
parent 1c3e402e669d047410b00de9193adf3c329844a2 (diff)
download spark-149b3ee2dac992355adbe44e989570726c1f35d0.tar.gz
spark-149b3ee2dac992355adbe44e989570726c1f35d0.tar.bz2
spark-149b3ee2dac992355adbe44e989570726c1f35d0.zip
[SPARK-7242][SQL][MLLIB] Frequent items for DataFrames
Finding frequent items with possibly false positives, using the algorithm described in `http://www.cs.umd.edu/~samir/498/karp.pdf`. The public API is: ``` df.stat.freqItems(cols: Array[String], support: Double = 0.001): DataFrame ``` The output is a local DataFrame having the input column names with `-freqItems` appended to them. This is a single-pass algorithm that may return false positives, but no false negatives. cc mengxr rxin. Let's get the implementations in; I can add the Python API in a follow-up PR. Author: Burak Yavuz <brkyvz@gmail.com> Closes #5799 from brkyvz/freq-items and squashes the following commits: a6ec82c [Burak Yavuz] addressed comments v? 39b1bba [Burak Yavuz] removed toSeq 0915e23 [Burak Yavuz] addressed comments v2.1 3a5c177 [Burak Yavuz] addressed comments v2.0 482e741 [Burak Yavuz] removed old import 38e784d [Burak Yavuz] addressed comments v1.0 8279d4d [Burak Yavuz] added default value for support 3d82168 [Burak Yavuz] made base implementation
Diffstat (limited to 'sql/core/src/test/java')
-rw-r--r-- sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java 14
1 files changed, 9 insertions, 5 deletions
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
index e5c9504d21..966d879e1f 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java
@@ -22,10 +22,7 @@ import com.google.common.primitives.Ints;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.TestData$;
+import org.apache.spark.sql.*;
import org.apache.spark.sql.test.TestSQLContext;
import org.apache.spark.sql.test.TestSQLContext$;
import org.apache.spark.sql.types.*;
@@ -178,5 +175,12 @@ public class JavaDataFrameSuite {
Assert.assertEquals(bean.getD().get(i), d.apply(i));
}
}
-
+
+ @Test
+ public void testFrequentItems() { // exercises df.stat().freqItems(...) through the Java API
+ DataFrame df = context.table("testData2"); // look up the table registered as "testData2"
+ String[] cols = new String[]{"a"}; // only column "a" is analysed
+ DataFrame results = df.stat().freqItems(cols, 0.2); // support threshold 0.2
+ Assert.assertTrue(results.collect()[0].getSeq(0).contains(1)); // first row, column 0: items reported for "a"; presumably 1 is frequent in testData2 - TODO confirm
+ }
}