aboutsummaryrefslogtreecommitdiff
path: root/project
diff options
context:
space:
mode:
author: Cheng Lian <lian@databricks.com> 2016-01-23 00:34:55 -0800
committer: Reynold Xin <rxin@databricks.com> 2016-01-23 00:34:55 -0800
commit: 1c690ddafa8376c55cbc5b7a7a750200abfbe2a6 (patch)
tree: 1be95d50cb9c14eb6051c1f068f6f708b1a34e9c /project
parent: 5af5a02160b42115579003b749c4d1831bf9d48e (diff)
download: spark-1c690ddafa8376c55cbc5b7a7a750200abfbe2a6.tar.gz
spark-1c690ddafa8376c55cbc5b7a7a750200abfbe2a6.tar.bz2
spark-1c690ddafa8376c55cbc5b7a7a750200abfbe2a6.zip
[SPARK-12933][SQL] Initial implementation of Count-Min sketch
This PR adds an initial implementation of the Count-Min sketch, contained in a new module, spark-sketch, under `common/sketch`. The implementation is based on the [`CountMinSketch` class in stream-lib][1]. As required by the [design doc][2], spark-sketch should have no external dependencies. Two classes, `Murmur3_x86_32` and `Platform`, are copied from spark-unsafe into spark-sketch to provide hashing facilities. They'll also be used in the upcoming bloom filter implementation.

The following features will be added in future follow-up PRs:

- Serialization support
- DataFrame API integration

[1]: https://github.com/addthis/stream-lib/blob/aac6b4d23a8686b000f80baa447e0922ecac3bcb/src/main/java/com/clearspring/analytics/stream/frequency/CountMinSketch.java
[2]: https://issues.apache.org/jira/secure/attachment/12782378/BloomFilterandCount-MinSketchinSpark2.0.pdf

Author: Cheng Lian <lian@databricks.com>

Closes #10851 from liancheng/count-min-sketch.
Diffstat (limited to 'project')
-rw-r--r-- project/SparkBuild.scala 39
1 files changed, 27 insertions, 12 deletions
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 3927b88fb0..4224a65a82 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -34,13 +34,24 @@ object BuildCommons {
private val buildLocation = file(".").getAbsoluteFile.getParentFile
- val allProjects@Seq(catalyst, core, graphx, hive, hiveThriftServer, mllib, repl,
- sql, networkCommon, networkShuffle, streaming, streamingFlumeSink, streamingFlume, streamingAkka, streamingKafka,
- streamingMqtt, streamingTwitter, streamingZeromq, launcher, unsafe, testTags) =
- Seq("catalyst", "core", "graphx", "hive", "hive-thriftserver", "mllib", "repl",
- "sql", "network-common", "network-shuffle", "streaming", "streaming-flume-sink",
- "streaming-flume", "streaming-akka", "streaming-kafka", "streaming-mqtt", "streaming-twitter",
- "streaming-zeromq", "launcher", "unsafe", "test-tags").map(ProjectRef(buildLocation, _))
+ val sqlProjects@Seq(catalyst, sql, hive, hiveThriftServer) = Seq(
+ "catalyst", "sql", "hive", "hive-thriftserver"
+ ).map(ProjectRef(buildLocation, _))
+
+ val streamingProjects@Seq(
+ streaming, streamingFlumeSink, streamingFlume, streamingAkka, streamingKafka, streamingMqtt,
+ streamingTwitter, streamingZeromq
+ ) = Seq(
+ "streaming", "streaming-flume-sink", "streaming-flume", "streaming-akka", "streaming-kafka",
+ "streaming-mqtt", "streaming-twitter", "streaming-zeromq"
+ ).map(ProjectRef(buildLocation, _))
+
+ val allProjects@Seq(
+ core, graphx, mllib, repl, networkCommon, networkShuffle, launcher, unsafe, testTags, sketch, _*
+ ) = Seq(
+ "core", "graphx", "mllib", "repl", "network-common", "network-shuffle", "launcher", "unsafe",
+ "test-tags", "sketch"
+ ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects
val optionallyEnabledProjects@Seq(yarn, java8Tests, sparkGangliaLgpl,
streamingKinesisAsl, dockerIntegrationTests) =
@@ -232,11 +243,15 @@ object SparkBuild extends PomBuild {
/* Enable tests settings for all projects except examples, assembly and tools */
(allProjects ++ optionallyEnabledProjects).foreach(enable(TestSettings.settings))
- // TODO: remove streamingAkka from this list after 2.0.0
- allProjects.filterNot(x => Seq(spark, hive, hiveThriftServer, catalyst, repl,
- networkCommon, networkShuffle, networkYarn, unsafe, streamingAkka, testTags).contains(x)).foreach {
- x => enable(MimaBuild.mimaSettings(sparkHome, x))(x)
- }
+ // TODO: remove streamingAkka and sketch from this list after 2.0.0
+ allProjects.filterNot { x =>
+ Seq(
+ spark, hive, hiveThriftServer, catalyst, repl, networkCommon, networkShuffle, networkYarn,
+ unsafe, streamingAkka, testTags, sketch
+ ).contains(x)
+ }.foreach { x =>
+ enable(MimaBuild.mimaSettings(sparkHome, x))(x)
+ }
/* Unsafe settings */
enable(Unsafe.settings)(unsafe)