From 2ff72f60ac6e42e9511fcf4effd1df89da8dc410 Mon Sep 17 00:00:00 2001
From: haoyuan
Date: Tue, 4 Sep 2012 17:55:55 -0700
Subject: add TopKWordCountRaw.scala

---
 .../streaming/examples/TopKWordCountRaw.scala      | 86 ++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 streaming/src/main/scala/spark/streaming/examples/TopKWordCountRaw.scala

diff --git a/streaming/src/main/scala/spark/streaming/examples/TopKWordCountRaw.scala b/streaming/src/main/scala/spark/streaming/examples/TopKWordCountRaw.scala
new file mode 100644
index 0000000000..7c01435f85
--- /dev/null
+++ b/streaming/src/main/scala/spark/streaming/examples/TopKWordCountRaw.scala
@@ -0,0 +1,86 @@
+package spark.streaming.examples
+
+import java.lang.{Long => JLong}
+
+import spark.util.IntParam
+import spark.storage.StorageLevel
+import spark.streaming._
+import spark.streaming.StreamingContext._
+
+object TopKWordCountRaw {
+  def main(args: Array[String]) {
+    if (args.length != 7) {
+      System.err.println("Usage: TopKWordCountRaw <master> <numStreams> <host> <port> <batchMs> <chkptMs> <reduces>")
+      System.exit(1)
+    }
+
+    val Array(master, IntParam(streams), host, IntParam(port), IntParam(batchMs),
+      IntParam(chkptMs), IntParam(reduces)) = args
+
+    // Create the context and set the batch size
+    val ssc = new StreamingContext(master, "TopKWordCountRaw")
+    ssc.setBatchDuration(Milliseconds(batchMs))
+
+    // Make sure some tasks have started on each node
+    ssc.sc.parallelize(1 to 1000, 1000).count()
+    ssc.sc.parallelize(1 to 1000, 1000).count()
+    ssc.sc.parallelize(1 to 1000, 1000).count()
+
+    val rawStreams = (1 to streams).map(_ =>
+      ssc.createRawNetworkStream[String](host, port, StorageLevel.MEMORY_ONLY_2)).toArray
+    val union = new UnifiedDStream(rawStreams)
+
+    import WordCount2_ExtraFunctions._
+
+    val windowedCounts = union.mapPartitions(splitAndCountPartitions)
+      .reduceByKeyAndWindow(add _, subtract _, Seconds(30), Milliseconds(batchMs), reduces)
+    windowedCounts.persist(StorageLevel.MEMORY_ONLY_DESER, StorageLevel.MEMORY_ONLY_DESER_2,
+      Milliseconds(chkptMs))
+
+    // Return the k pairs with the largest counts, kept sorted in a fixed-size array
+    def topK(data: Iterator[(String, JLong)], k: Int): Iterator[(String, JLong)] = {
+      val taken = new Array[(String, JLong)](k)
+
+      var i = 0
+      var len = 0
+      var value: (String, JLong) = null
+      var swap: (String, JLong) = null
+      var count = 0
+
+      while (data.hasNext) {
+        value = data.next
+        count += 1
+        if (len == 0) {
+          taken(0) = value
+          len = 1
+        } else if (len < k || value._2 > taken(len - 1)._2) {
+          if (len < k) {
+            len += 1
+          }
+          taken(len - 1) = value
+          i = len - 1
+          while (i > 0 && taken(i - 1)._2 < taken(i)._2) {
+            swap = taken(i)
+            taken(i) = taken(i - 1)
+            taken(i - 1) = swap
+            i -= 1
+          }
+        }
+      }
+      println("Took " + len + " out of " + count + " items")
+      taken.take(len).toIterator
+    }
+
+    val k = 50
+    // Take the top k counts within each partition, then merge the partial
+    // results on the driver
+    val partialTopKWindowedCounts = windowedCounts.mapPartitions(topK(_, k))
+    partialTopKWindowedCounts.foreachRDD(rdd => {
+      val collectedCounts = rdd.collect
+      println("Collected " + collectedCounts.size + " items")
+      topK(collectedCounts.toIterator, k).foreach(println)
+    })
+
+    ssc.start()
+  }
+}
--
cgit v1.2.3
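
A note on the two-phase top-K above: reduceByKeyAndWindow shuffles the windowed counts by key, so each word appears in exactly one partition. Taking the top k per partition with mapPartitions and then re-applying topK to the collected partials on the driver therefore yields the exact global top k, while only k pairs per partition cross the network. The pattern can be exercised without a cluster; below is a minimal standalone sketch (the object name and sample data are illustrative, not part of the patch, and it uses plain Scala Long instead of JLong):

object TopKMergeDemo {
  // Same bounded insertion-sort top-K as in the patch: keep the k largest
  // pairs sorted in a fixed-size array, bubbling new entries into place.
  def topK(data: Iterator[(String, Long)], k: Int): Iterator[(String, Long)] = {
    val taken = new Array[(String, Long)](k)
    var len = 0
    while (data.hasNext) {
      val value = data.next()
      if (len < k || value._2 > taken(len - 1)._2) {
        if (len < k) len += 1
        taken(len - 1) = value
        var i = len - 1
        while (i > 0 && taken(i - 1)._2 < taken(i)._2) {
          val swap = taken(i); taken(i) = taken(i - 1); taken(i - 1) = swap
          i -= 1
        }
      }
    }
    taken.take(len).iterator
  }

  def main(args: Array[String]): Unit = {
    // Three stand-in partitions with disjoint keys, as after a shuffle by key
    val partitions = Seq(
      Seq("a" -> 9L, "b" -> 3L, "c" -> 7L),
      Seq("d" -> 8L, "e" -> 2L, "f" -> 1L),
      Seq("g" -> 6L, "h" -> 5L, "i" -> 4L))
    val k = 3
    // Phase 1: top k within each partition (mapPartitions in the patch)
    val partial = partitions.flatMap(p => topK(p.iterator, k))
    // Phase 2: merge the partial results on the "driver" (collect + topK)
    topK(partial.iterator, k).foreach(println)
  }
}

With k = 3 this prints (a,9), (d,8), (c,7): each stand-in partition contributes at most k candidates, and the final pass picks the winners.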
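
For comparison, this kind of bounded selection is often written with a size-k min-heap, which is O(n log k) rather than O(nk) in the worst case; for small k (50 here) the insertion-sorted array is cheap and allocation-free, which is presumably why the example keeps it. A sketch of the heap variant (not part of the patch), again over plain Scala Long:

import scala.collection.mutable.PriorityQueue

object TopKHeapSketch {
  // Keep the k largest pairs by evicting the current minimum once the queue
  // grows past k. Ordering by negated count makes dequeue() return the pair
  // with the smallest count.
  def topKHeap(data: Iterator[(String, Long)], k: Int): Iterator[(String, Long)] = {
    val heap = PriorityQueue.empty[(String, Long)](Ordering.by((p: (String, Long)) => -p._2))
    for (pair <- data) {
      heap.enqueue(pair)
      if (heap.size > k) heap.dequeue()
    }
    // dequeueAll yields smallest count first under this ordering; reverse to
    // report the largest counts first, matching topK above
    heap.dequeueAll.reverse.iterator
  }
}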