aboutsummaryrefslogtreecommitdiff
path: root/common/sketch
diff options
context:
space:
mode:
authorCheng Lian <lian@databricks.com>2016-01-26 20:12:34 -0800
committerReynold Xin <rxin@databricks.com>2016-01-26 20:12:34 -0800
commitce38a35b764397fcf561ac81de6da96579f5c13e (patch)
tree0f03dfb31f4840488fabc75d5b4edbdc7eb0d874 /common/sketch
parente7f9199e709c46a6b5ad6b03c9ecf12cc19e3a41 (diff)
downloadspark-ce38a35b764397fcf561ac81de6da96579f5c13e.tar.gz
spark-ce38a35b764397fcf561ac81de6da96579f5c13e.tar.bz2
spark-ce38a35b764397fcf561ac81de6da96579f5c13e.zip
[SPARK-12935][SQL] DataFrame API for Count-Min Sketch
This PR integrates Count-Min Sketch from spark-sketch into DataFrame. This version resorts to `RDD.aggregate` for building the sketch. A more performant UDAF version can be built in future follow-up PRs. Author: Cheng Lian <lian@databricks.com> Closes #10911 from liancheng/cms-df-api.
Diffstat (limited to 'common/sketch')
-rw-r--r--common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java10
-rw-r--r--common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java26
-rw-r--r--common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java56
3 files changed, 56 insertions, 36 deletions
diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java
index 00378d5851..d392fb187a 100644
--- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java
+++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java
@@ -47,10 +47,12 @@ public abstract class BloomFilter {
public enum Version {
/**
* {@code BloomFilter} binary format version 1 (all values written in big-endian order):
- * - Version number, always 1 (32 bit)
- * - Total number of words of the underlying bit array (32 bit)
- * - The words/longs (numWords * 64 bit)
- * - Number of hash functions (32 bit)
+ * <ul>
+ * <li>Version number, always 1 (32 bit)</li>
+ * <li>Total number of words of the underlying bit array (32 bit)</li>
+ * <li>The words/longs (numWords * 64 bit)</li>
+ * <li>Number of hash functions (32 bit)</li>
+ * </ul>
*/
V1(1);
diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java
index 00c0b1b9e2..5692e574d4 100644
--- a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java
+++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketch.java
@@ -59,16 +59,22 @@ abstract public class CountMinSketch {
public enum Version {
/**
* {@code CountMinSketch} binary format version 1 (all values written in big-endian order):
- * - Version number, always 1 (32 bit)
- * - Total count of added items (64 bit)
- * - Depth (32 bit)
- * - Width (32 bit)
- * - Hash functions (depth * 64 bit)
- * - Count table
- * - Row 0 (width * 64 bit)
- * - Row 1 (width * 64 bit)
- * - ...
- * - Row depth - 1 (width * 64 bit)
+ * <ul>
+ * <li>Version number, always 1 (32 bit)</li>
+ * <li>Total count of added items (64 bit)</li>
+ * <li>Depth (32 bit)</li>
+ * <li>Width (32 bit)</li>
+ * <li>Hash functions (depth * 64 bit)</li>
+ * <li>
+ * Count table
+ * <ul>
+ * <li>Row 0 (width * 64 bit)</li>
+ * <li>Row 1 (width * 64 bit)</li>
+ * <li>...</li>
+ * <li>Row {@code depth - 1} (width * 64 bit)</li>
+ * </ul>
+ * </li>
+ * </ul>
*/
V1(1);
diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
index d08809605a..8cc29e4076 100644
--- a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
+++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
@@ -21,13 +21,16 @@ import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
import java.io.OutputStream;
+import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Random;
-class CountMinSketchImpl extends CountMinSketch {
- public static final long PRIME_MODULUS = (1L << 31) - 1;
+class CountMinSketchImpl extends CountMinSketch implements Serializable {
+ private static final long PRIME_MODULUS = (1L << 31) - 1;
private int depth;
private int width;
@@ -37,6 +40,9 @@ class CountMinSketchImpl extends CountMinSketch {
private double eps;
private double confidence;
+ private CountMinSketchImpl() {
+ }
+
CountMinSketchImpl(int depth, int width, int seed) {
this.depth = depth;
this.width = width;
@@ -55,16 +61,6 @@ class CountMinSketchImpl extends CountMinSketch {
initTablesWith(depth, width, seed);
}
- CountMinSketchImpl(int depth, int width, long totalCount, long hashA[], long table[][]) {
- this.depth = depth;
- this.width = width;
- this.eps = 2.0 / width;
- this.confidence = 1 - 1 / Math.pow(2, depth);
- this.hashA = hashA;
- this.table = table;
- this.totalCount = totalCount;
- }
-
@Override
public boolean equals(Object other) {
if (other == this) {
@@ -325,27 +321,43 @@ class CountMinSketchImpl extends CountMinSketch {
}
public static CountMinSketchImpl readFrom(InputStream in) throws IOException {
+ CountMinSketchImpl sketch = new CountMinSketchImpl();
+ sketch.readFrom0(in);
+ return sketch;
+ }
+
+ private void readFrom0(InputStream in) throws IOException {
DataInputStream dis = new DataInputStream(in);
- // Ignores version number
- dis.readInt();
+ int version = dis.readInt();
+ if (version != Version.V1.getVersionNumber()) {
+ throw new IOException("Unexpected Count-Min Sketch version number (" + version + ")");
+ }
- long totalCount = dis.readLong();
- int depth = dis.readInt();
- int width = dis.readInt();
+ this.totalCount = dis.readLong();
+ this.depth = dis.readInt();
+ this.width = dis.readInt();
+ this.eps = 2.0 / width;
+ this.confidence = 1 - 1 / Math.pow(2, depth);
- long hashA[] = new long[depth];
+ this.hashA = new long[depth];
for (int i = 0; i < depth; ++i) {
- hashA[i] = dis.readLong();
+ this.hashA[i] = dis.readLong();
}
- long table[][] = new long[depth][width];
+ this.table = new long[depth][width];
for (int i = 0; i < depth; ++i) {
for (int j = 0; j < width; ++j) {
- table[i][j] = dis.readLong();
+ this.table[i][j] = dis.readLong();
}
}
+ }
+
+ private void writeObject(ObjectOutputStream out) throws IOException {
+ this.writeTo(out);
+ }
- return new CountMinSketchImpl(depth, width, totalCount, hashA, table);
+ private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+ this.readFrom0(in);
}
}