From 1c690ddafa8376c55cbc5b7a7a750200abfbe2a6 Mon Sep 17 00:00:00 2001
From: Cheng Lian
Date: Sat, 23 Jan 2016 00:34:55 -0800
Subject: [SPARK-12933][SQL] Initial implementation of Count-Min sketch

This PR adds an initial implementation of Count-Min sketch, contained in a new module spark-sketch under `common/sketch`. The implementation is based on the [`CountMinSketch` class in stream-lib][1].

As required by the [design doc][2], spark-sketch should have no external dependency. Two classes, `Murmur3_x86_32` and `Platform`, are copied to spark-sketch from spark-unsafe for hashing facilities. They'll also be used in the upcoming Bloom filter implementation.

The following features will be added in future follow-up PRs:

- Serialization support
- DataFrame API integration

[1]: https://github.com/addthis/stream-lib/blob/aac6b4d23a8686b000f80baa447e0922ecac3bcb/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java
[2]: https://issues.apache.org/jira/secure/attachment/12782378/BloomFilterandCount-MinSketchinSpark2.0.pdf

Author: Cheng Lian

Closes #10851 from liancheng/count-min-sketch.
---
 dev/sparktestsupport/modules.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'dev/sparktestsupport/modules.py')

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index efe58ea2e0..032c0616ed 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -113,6 +113,18 @@ hive_thriftserver = Module(
 )
 
 
+sketch = Module(
+    name="sketch",
+    dependencies=[],
+    source_file_regexes=[
+        "common/sketch/",
+    ],
+    sbt_test_goals=[
+        "sketch/test"
+    ]
+)
+
+
 graphx = Module(
     name="graphx",
     dependencies=[],
-- 
cgit v1.2.3