diff options
author | Wenchen Fan <wenchen@databricks.com> | 2016-01-27 13:29:09 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-01-27 13:29:09 -0800 |
commit | 680afabe78b77e4e63e793236453d69567d24290 (patch) | |
tree | 483b4a1e2669aefec50f6293408ee16f0e5dcdad /sql/core/src/test/java | |
parent | 32f741115bda5d7d7dbfcd9fe827ecbea7303ffa (diff) | |
download | spark-680afabe78b77e4e63e793236453d69567d24290.tar.gz spark-680afabe78b77e4e63e793236453d69567d24290.tar.bz2 spark-680afabe78b77e4e63e793236453d69567d24290.zip |
[SPARK-12938][SQL] DataFrame API for Bloom filter
This PR integrates Bloom filter from spark-sketch into DataFrame. This version resorts to RDD.aggregate for building the filter. A more performant UDAF version can be built in future follow-up PRs.
This PR also adds two specific `put` variants (`putBinary` and `putLong`) to `BloomFilter`, which make it easier to build a Bloom filter over a `DataFrame`.
Author: Wenchen Fan <wenchen@databricks.com>
Closes #10937 from cloud-fan/bloom-filter.
Diffstat (limited to 'sql/core/src/test/java')
-rw-r--r-- | sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java | 31 |
1 files changed, 31 insertions, 0 deletions
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 9cf94e72d3..0d4c128cb3 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -40,6 +40,7 @@ import org.apache.spark.sql.types.*; import org.apache.spark.util.sketch.CountMinSketch; import static org.apache.spark.sql.functions.*; import static org.apache.spark.sql.types.DataTypes.*; +import org.apache.spark.util.sketch.BloomFilter; public class JavaDataFrameSuite { private transient JavaSparkContext jsc; @@ -300,6 +301,7 @@ public class JavaDataFrameSuite { Assert.assertEquals(30000.0, actual[1].getDouble(2), 0.01); } + @Test public void testGenericLoad() { DataFrame df1 = context.read().format("text").load( Thread.currentThread().getContextClassLoader().getResource("text-suite.txt").toString()); @@ -347,4 +349,33 @@ public class JavaDataFrameSuite { Assert.assertEquals(sketch4.relativeError(), 0.001, 1e-4); Assert.assertEquals(sketch4.confidence(), 0.99, 5e-3); } + + @Test + public void testBloomFilter() { + DataFrame df = context.range(1000); + + BloomFilter filter1 = df.stat().bloomFilter("id", 1000, 0.03); + assert (filter1.expectedFpp() - 0.03 < 1e-3); + for (int i = 0; i < 1000; i++) { + assert (filter1.mightContain(i)); + } + + BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), 1000, 0.03); + assert (filter2.expectedFpp() - 0.03 < 1e-3); + for (int i = 0; i < 1000; i++) { + assert (filter2.mightContain(i * 3)); + } + + BloomFilter filter3 = df.stat().bloomFilter("id", 1000, 64 * 5); + assert (filter3.bitSize() == 64 * 5); + for (int i = 0; i < 1000; i++) { + assert (filter3.mightContain(i)); + } + + BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5); + assert (filter4.bitSize() == 64 * 5); + for (int i = 0; i < 1000; i++) { + assert 
(filter4.mightContain(i * 3)); + } + } } |