diff options
author | Cheng Lian <lian@databricks.com> | 2016-01-26 20:12:34 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-01-26 20:12:34 -0800 |
commit | ce38a35b764397fcf561ac81de6da96579f5c13e (patch) | |
tree | 0f03dfb31f4840488fabc75d5b4edbdc7eb0d874 /common/sketch/src | |
parent | e7f9199e709c46a6b5ad6b03c9ecf12cc19e3a41 (diff) | |
download | spark-ce38a35b764397fcf561ac81de6da96579f5c13e.tar.gz spark-ce38a35b764397fcf561ac81de6da96579f5c13e.tar.bz2 spark-ce38a35b764397fcf561ac81de6da96579f5c13e.zip |
[SPARK-12935][SQL] DataFrame API for Count-Min Sketch
This PR integrates Count-Min Sketch from spark-sketch into DataFrame. This version resorts to `RDD.aggregate` for building the sketch. A more performant UDAF version can be built in future follow-up PRs.
Author: Cheng Lian <lian@databricks.com>
Closes #10911 from liancheng/cms-df-api.
Diffstat (limited to 'common/sketch/src')
3 files changed, 56 insertions, 36 deletions
diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index 00378d5851..d392fb187a 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -47,10 +47,12 @@ public abstract class BloomFilter { public enum Version { /** * {@code BloomFilter} binary format version 1 (all values written in big-endian order): - * - Version number, always 1 (32 bit) - * - Total number of words of the underlying bit array (32 bit) - * - The words/longs (numWords * 64 bit) - * - Number of hash functions (32 bit) + * <ul> + * <li>Version number, always 1 (32 bit)</li> + * <li>Total number of words of the underlying bit array (32 bit)</li> + * <li>The words/longs (numWords * 64 bit)</li> + * <li>Number of hash functions (32 bit)</li> + * </ul> */ V1(1); diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java index 00c0b1b9e2..5692e574d4 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java @@ -59,16 +59,22 @@ abstract public class CountMinSketch { public enum Version { /** * {@code CountMinSketch} binary format version 1 (all values written in big-endian order): - * - Version number, always 1 (32 bit) - * - Total count of added items (64 bit) - * - Depth (32 bit) - * - Width (32 bit) - * - Hash functions (depth * 64 bit) - * - Count table - * - Row 0 (width * 64 bit) - * - Row 1 (width * 64 bit) - * - ... - * - Row depth - 1 (width * 64 bit) + * <ul> + * <li>Version number, always 1 (32 bit)</li> + * <li>Total count of added items (64 bit)</li> + * <li>Depth (32 bit)</li> + * <li>Width (32 bit)</li> + * <li>Hash functions (depth * 64 bit)</li> + * <li> + * Count table + * <ul> + * <li>Row 0 (width * 64 bit)</li> + * <li>Row 1 (width * 64 bit)</li> + * <li>...</li> + * <li>Row {@code depth - 1} (width * 64 bit)</li> + * </ul> + * </li> + * </ul> */ V1(1); diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java index d08809605a..8cc29e4076 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java @@ -21,13 +21,16 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; import java.io.OutputStream; +import java.io.Serializable; import java.io.UnsupportedEncodingException; import java.util.Arrays; import java.util.Random; -class CountMinSketchImpl extends CountMinSketch { - public static final long PRIME_MODULUS = (1L << 31) - 1; +class CountMinSketchImpl extends CountMinSketch implements Serializable { + private static final long PRIME_MODULUS = (1L << 31) - 1; private int depth; private int width; @@ -37,6 +40,9 @@ class CountMinSketchImpl extends CountMinSketch { private double eps; private double confidence; + private CountMinSketchImpl() { + } + CountMinSketchImpl(int depth, int width, int seed) { this.depth = depth; this.width = width; @@ -55,16 +61,6 @@ class CountMinSketchImpl extends CountMinSketch { initTablesWith(depth, width, seed); } - CountMinSketchImpl(int depth, int width, long totalCount, long hashA[], long table[][]) { - this.depth = depth; - this.width = width; - this.eps = 2.0 / width; - this.confidence = 1 - 1 / Math.pow(2, depth); - this.hashA = hashA; - this.table = table; - this.totalCount = totalCount; - } - @Override public boolean equals(Object other) { if (other == this) { @@ -325,27 +321,43 @@ class CountMinSketchImpl extends CountMinSketch { } public static CountMinSketchImpl readFrom(InputStream in) throws IOException { + CountMinSketchImpl sketch = new CountMinSketchImpl(); + sketch.readFrom0(in); + return sketch; + } + + private void readFrom0(InputStream in) throws IOException { DataInputStream dis = new DataInputStream(in); - // Ignores version number - dis.readInt(); + int version = dis.readInt(); + if (version != Version.V1.getVersionNumber()) { + throw new IOException("Unexpected Count-Min Sketch version number (" + version + ")"); + } - long totalCount = dis.readLong(); - int depth = dis.readInt(); - int width = dis.readInt(); + this.totalCount = dis.readLong(); + this.depth = dis.readInt(); + this.width = dis.readInt(); + this.eps = 2.0 / width; + this.confidence = 1 - 1 / Math.pow(2, depth); - long hashA[] = new long[depth]; + this.hashA = new long[depth]; for (int i = 0; i < depth; ++i) { - hashA[i] = dis.readLong(); + this.hashA[i] = dis.readLong(); } - long table[][] = new long[depth][width]; + this.table = new long[depth][width]; for (int i = 0; i < depth; ++i) { for (int j = 0; j < width; ++j) { - table[i][j] = dis.readLong(); + this.table[i][j] = dis.readLong(); } } + } + + private void writeObject(ObjectOutputStream out) throws IOException { + this.writeTo(out); + } - return new CountMinSketchImpl(depth, width, totalCount, hashA, table); + private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { + this.readFrom0(in); } } |