From c97ebf64377e853ab7c616a103869a4417f25954 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Mon, 19 Nov 2012 23:22:07 +0000 Subject: Fixed bug in the number of splits in RDD after checkpointing. Modified reduceByKeyAndWindow (naive) computation from window+reduceByKey to reduceByKey+window+reduceByKey. --- streaming/src/main/scala/spark/streaming/DStream.scala | 3 ++- streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala | 6 +++++- streaming/src/main/scala/spark/streaming/Scheduler.scala | 2 +- streaming/src/main/scala/spark/streaming/WindowedDStream.scala | 3 +++ 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'streaming') diff --git a/streaming/src/main/scala/spark/streaming/DStream.scala b/streaming/src/main/scala/spark/streaming/DStream.scala index 13770aa8fd..26d5ce9198 100644 --- a/streaming/src/main/scala/spark/streaming/DStream.scala +++ b/streaming/src/main/scala/spark/streaming/DStream.scala @@ -321,7 +321,8 @@ extends Serializable with Logging { } } } - logInfo("Updated checkpoint data for time " + currentTime) + logInfo("Updated checkpoint data for time " + currentTime + ", " + checkpointData.size + " checkpoints, " + + "[" + checkpointData.mkString(",") + "]") } /** diff --git a/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala b/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala index e09d27d34f..720e63bba0 100644 --- a/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala +++ b/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala @@ -4,6 +4,7 @@ import spark.streaming.StreamingContext._ import spark.{Manifests, RDD, Partitioner, HashPartitioner} import spark.SparkContext._ +import spark.storage.StorageLevel import scala.collection.mutable.ArrayBuffer @@ -115,7 +116,10 @@ extends Serializable { slideTime: Time, partitioner: Partitioner ): DStream[(K, V)] = { - self.window(windowTime, slideTime).reduceByKey(ssc.sc.clean(reduceFunc), partitioner) + val cleanedReduceFunc = ssc.sc.clean(reduceFunc) + self.reduceByKey(cleanedReduceFunc, partitioner) + .window(windowTime, slideTime) + .reduceByKey(cleanedReduceFunc, partitioner) } // This method is the efficient sliding window reduce operation, diff --git a/streaming/src/main/scala/spark/streaming/Scheduler.scala b/streaming/src/main/scala/spark/streaming/Scheduler.scala index e2dca91179..014021be61 100644 --- a/streaming/src/main/scala/spark/streaming/Scheduler.scala +++ b/streaming/src/main/scala/spark/streaming/Scheduler.scala @@ -17,7 +17,7 @@ extends Logging { val graph = ssc.graph - val concurrentJobs = System.getProperty("spark.stream.concurrentJobs", "1").toInt + val concurrentJobs = System.getProperty("spark.streaming.concurrentJobs", "1").toInt val jobManager = new JobManager(ssc, concurrentJobs) val checkpointWriter = if (ssc.checkpointInterval != null && ssc.checkpointDir != null) { diff --git a/streaming/src/main/scala/spark/streaming/WindowedDStream.scala b/streaming/src/main/scala/spark/streaming/WindowedDStream.scala index ce89a3f99b..e4d2a634f5 100644 --- a/streaming/src/main/scala/spark/streaming/WindowedDStream.scala +++ b/streaming/src/main/scala/spark/streaming/WindowedDStream.scala @@ -2,6 +2,7 @@ package spark.streaming import spark.RDD import spark.rdd.UnionRDD +import spark.storage.StorageLevel class WindowedDStream[T: ClassManifest]( @@ -18,6 +19,8 @@ class WindowedDStream[T: ClassManifest]( throw new Exception("The slide duration of WindowedDStream (" + _slideTime + ") " + "must be multiple of the slide 
duration of parent DStream (" + parent.slideTime + ")") + parent.persist(StorageLevel.MEMORY_ONLY_SER) + def windowTime: Time = _windowTime override def dependencies = List(parent) -- cgit v1.2.3 From fd11d23bb3a817dabd414bceddebc35ad731f626 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Mon, 19 Nov 2012 19:04:39 -0800 Subject: Modified StreamingContext API to make constructor accept the batch size (since it is always needed, Patrick's suggestion). Added description to DStream and StreamingContext. --- .../src/main/scala/spark/streaming/DStream.scala | 26 ++++++++++-- .../main/scala/spark/streaming/DStreamGraph.scala | 4 +- .../scala/spark/streaming/StreamingContext.scala | 49 +++++++++++++++------- .../scala/spark/streaming/examples/CountRaw.scala | 32 -------------- .../spark/streaming/examples/FileStream.scala | 7 ++-- .../examples/FileStreamWithCheckpoint.scala | 5 +-- .../scala/spark/streaming/examples/GrepRaw.scala | 7 ++-- .../spark/streaming/examples/QueueStream.scala | 10 ++--- .../streaming/examples/TopKWordCountRaw.scala | 7 ++-- .../spark/streaming/examples/WordCountHdfs.scala | 5 +-- .../streaming/examples/WordCountNetwork.scala | 6 +-- .../spark/streaming/examples/WordCountRaw.scala | 7 ++-- .../examples/clickstream/PageViewStream.scala | 5 +-- .../spark/streaming/BasicOperationsSuite.scala | 2 +- .../scala/spark/streaming/InputStreamsSuite.scala | 12 ++---- .../test/scala/spark/streaming/TestSuiteBase.scala | 6 +-- 16 files changed, 92 insertions(+), 98 deletions(-) delete mode 100644 streaming/src/main/scala/spark/streaming/examples/CountRaw.scala (limited to 'streaming') diff --git a/streaming/src/main/scala/spark/streaming/DStream.scala b/streaming/src/main/scala/spark/streaming/DStream.scala index 26d5ce9198..8efda2074d 100644 --- a/streaming/src/main/scala/spark/streaming/DStream.scala +++ b/streaming/src/main/scala/spark/streaming/DStream.scala @@ -17,6 +17,26 @@ import java.io.{ObjectInputStream, IOException, ObjectOutputStream} import org.apache.hadoop.fs.Path import org.apache.hadoop.conf.Configuration +/** + * A Discretized Stream (DStream), the basic abstraction in Spark Streaming, is a continuous + * sequence of RDDs (of the same type) representing a continuous stream of data (see [[spark.RDD]] + * for more details on RDDs). DStreams can either be created from live data (such as, data from + * HDFS. Kafka or Flume) or it can be generated by transformation existing DStreams using operations + * such as `map`, `window` and `reduceByKeyAndWindow`. While a Spark Streaming program is running, each + * DStream periodically generates a RDD, either from live data or by transforming the RDD generated + * by a parent DStream. + * + * This class contains the basic operations available on all DStreams, such as `map`, `filter` and + * `window`. In addition, [[spark.streaming.PairDStreamFunctions]] contains operations available + * only on DStreams of key-value pairs, such as `groupByKeyAndWindow` and `join`. These operations + * are automatically available on any DStream of the right type (e.g., DStream[(Int, Int)] through + * implicit conversions when `spark.streaming.StreamingContext._` is imported. 
+ * + * DStreams internally is characterized by a few basic properties: + * - A list of other DStreams that the DStream depends on + * - A time interval at which the DStream generates an RDD + * - A function that is used to generate an RDD after each time interval + */ abstract class DStream[T: ClassManifest] (@transient var ssc: StreamingContext) extends Serializable with Logging { @@ -28,7 +48,7 @@ extends Serializable with Logging { * ---------------------------------------------- */ - // Time by which the window slides in this DStream + // Time interval at which the DStream generates an RDD def slideTime: Time // List of parent DStreams on which this DStream depends on @@ -186,12 +206,12 @@ extends Serializable with Logging { dependencies.foreach(_.setGraph(graph)) } - protected[streaming] def setRememberDuration(duration: Time) { + protected[streaming] def remember(duration: Time) { if (duration != null && duration > rememberDuration) { rememberDuration = duration logInfo("Duration for remembering RDDs set to " + rememberDuration + " for " + this) } - dependencies.foreach(_.setRememberDuration(parentRememberDuration)) + dependencies.foreach(_.remember(parentRememberDuration)) } /** This method checks whether the 'time' is valid wrt slideTime for generating RDD */ diff --git a/streaming/src/main/scala/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/spark/streaming/DStreamGraph.scala index bd8c033eab..d0a9ade61d 100644 --- a/streaming/src/main/scala/spark/streaming/DStreamGraph.scala +++ b/streaming/src/main/scala/spark/streaming/DStreamGraph.scala @@ -22,7 +22,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { } zeroTime = time outputStreams.foreach(_.initialize(zeroTime)) - outputStreams.foreach(_.setRememberDuration(rememberDuration)) + outputStreams.foreach(_.remember(rememberDuration)) outputStreams.foreach(_.validate) inputStreams.par.foreach(_.start()) } @@ -50,7 +50,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { batchDuration = duration } - private[streaming] def setRememberDuration(duration: Time) { + private[streaming] def remember(duration: Time) { this.synchronized { if (rememberDuration != null) { throw new Exception("Batch duration already set as " + batchDuration + diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala index 7a9a71f303..4a41f2f516 100644 --- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala @@ -18,19 +18,39 @@ import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.hadoop.fs.Path import java.util.UUID -final class StreamingContext ( +/** + * A StreamingContext is the main entry point for Spark Streaming functionality. Besides the basic + * information (such as, cluster URL and job name) to internally create a SparkContext, it provides + * methods used to create DStream from various input sources. + */ +class StreamingContext private ( sc_ : SparkContext, - cp_ : Checkpoint + cp_ : Checkpoint, + batchDur_ : Time ) extends Logging { - def this(sparkContext: SparkContext) = this(sparkContext, null) - - def this(master: String, frameworkName: String, sparkHome: String = null, jars: Seq[String] = Nil) = - this(new SparkContext(master, frameworkName, sparkHome, jars), null) + /** + * Creates a StreamingContext using an existing SparkContext. 
+ * @param sparkContext Existing SparkContext + * @param batchDuration The time interval at which streaming data will be divided into batches + */ + def this(sparkContext: SparkContext, batchDuration: Time) = this(sparkContext, null, batchDuration) - def this(path: String) = this(null, CheckpointReader.read(path)) + /** + * Creates a StreamingContext by providing the details necessary for creating a new SparkContext. + * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). + * @param frameworkName A name for your job, to display on the cluster web UI + * @param batchDuration The time interval at which streaming data will be divided into batches + */ + def this(master: String, frameworkName: String, batchDuration: Time) = + this(new SparkContext(master, frameworkName), null, batchDuration) - def this(cp_ : Checkpoint) = this(null, cp_) + /** + * Recreates the StreamingContext from a checkpoint file. + * @param path Path either to the directory that was specified as the checkpoint directory, or + * to the checkpoint file 'graph' or 'graph.bk'. + */ + def this(path: String) = this(null, CheckpointReader.read(path), null) initLogging() @@ -57,7 +77,10 @@ final class StreamingContext ( cp_.graph.restoreCheckpointData() cp_.graph } else { - new DStreamGraph() + assert(batchDur_ != null, "Batch duration for streaming context cannot be null") + val newGraph = new DStreamGraph() + newGraph.setBatchDuration(batchDur_) + newGraph } } @@ -77,12 +100,8 @@ final class StreamingContext ( private[streaming] var receiverJobThread: Thread = null private[streaming] var scheduler: Scheduler = null - def setBatchDuration(duration: Time) { - graph.setBatchDuration(duration) - } - - def setRememberDuration(duration: Time) { - graph.setRememberDuration(duration) + def remember(duration: Time) { + graph.remember(duration) } def checkpoint(dir: String, interval: Time = null) { diff --git a/streaming/src/main/scala/spark/streaming/examples/CountRaw.scala b/streaming/src/main/scala/spark/streaming/examples/CountRaw.scala deleted file mode 100644 index d2fdabd659..0000000000 --- a/streaming/src/main/scala/spark/streaming/examples/CountRaw.scala +++ /dev/null @@ -1,32 +0,0 @@ -package spark.streaming.examples - -import spark.util.IntParam -import spark.storage.StorageLevel -import spark.streaming._ -import spark.streaming.StreamingContext._ - -object CountRaw { - def main(args: Array[String]) { - if (args.length != 5) { - System.err.println("Usage: CountRaw ") - System.exit(1) - } - - val Array(master, IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args - - // Create the context and set the batch size - val ssc = new StreamingContext(master, "CountRaw") - ssc.setBatchDuration(Milliseconds(batchMillis)) - - // Make sure some tasks have started on each node - ssc.sc.parallelize(1 to 1000, 1000).count() - ssc.sc.parallelize(1 to 1000, 1000).count() - ssc.sc.parallelize(1 to 1000, 1000).count() - - val rawStreams = (1 to numStreams).map(_ => - ssc.rawNetworkStream[String](host, port, StorageLevel.MEMORY_ONLY_2)).toArray - val union = new UnionDStream(rawStreams) - union.map(_.length + 2).reduce(_ + _).foreachRDD(r => println("Byte count: " + r.collect().mkString)) - ssc.start() - } -} diff --git a/streaming/src/main/scala/spark/streaming/examples/FileStream.scala b/streaming/src/main/scala/spark/streaming/examples/FileStream.scala index d68611abd6..81938d30d4 100644 --- a/streaming/src/main/scala/spark/streaming/examples/FileStream.scala +++ 
b/streaming/src/main/scala/spark/streaming/examples/FileStream.scala @@ -14,10 +14,9 @@ object FileStream { System.exit(1) } - // Create the context and set the batch size - val ssc = new StreamingContext(args(0), "FileStream") - ssc.setBatchDuration(Seconds(2)) - + // Create the context + val ssc = new StreamingContext(args(0), "FileStream", Seconds(1)) + // Create the new directory val directory = new Path(args(1)) val fs = directory.getFileSystem(new Configuration()) diff --git a/streaming/src/main/scala/spark/streaming/examples/FileStreamWithCheckpoint.scala b/streaming/src/main/scala/spark/streaming/examples/FileStreamWithCheckpoint.scala index 21a83c0fde..b7bc15a1d5 100644 --- a/streaming/src/main/scala/spark/streaming/examples/FileStreamWithCheckpoint.scala +++ b/streaming/src/main/scala/spark/streaming/examples/FileStreamWithCheckpoint.scala @@ -32,9 +32,8 @@ object FileStreamWithCheckpoint { if (!fs.exists(directory)) fs.mkdirs(directory) // Create new streaming context - val ssc_ = new StreamingContext(args(0), "FileStreamWithCheckpoint") - ssc_.setBatchDuration(Seconds(1)) - ssc_.checkpoint(checkpointDir, Seconds(1)) + val ssc_ = new StreamingContext(args(0), "FileStreamWithCheckpoint", Seconds(1)) + ssc_.checkpoint(checkpointDir) // Setup the streaming computation val inputStream = ssc_.textFileStream(directory.toString) diff --git a/streaming/src/main/scala/spark/streaming/examples/GrepRaw.scala b/streaming/src/main/scala/spark/streaming/examples/GrepRaw.scala index ffbea6e55d..6cb2b4c042 100644 --- a/streaming/src/main/scala/spark/streaming/examples/GrepRaw.scala +++ b/streaming/src/main/scala/spark/streaming/examples/GrepRaw.scala @@ -16,9 +16,10 @@ object GrepRaw { val Array(master, IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args - // Create the context and set the batch size - val ssc = new StreamingContext(master, "GrepRaw") - ssc.setBatchDuration(Milliseconds(batchMillis)) + // Create the context + val ssc = new StreamingContext(master, "GrepRaw", Milliseconds(batchMillis)) + + // Warm up the JVMs on master and slave for JIT compilation to kick in warmUp(ssc.sc) diff --git a/streaming/src/main/scala/spark/streaming/examples/QueueStream.scala b/streaming/src/main/scala/spark/streaming/examples/QueueStream.scala index 2af51bad28..2a265d021d 100644 --- a/streaming/src/main/scala/spark/streaming/examples/QueueStream.scala +++ b/streaming/src/main/scala/spark/streaming/examples/QueueStream.scala @@ -1,9 +1,8 @@ package spark.streaming.examples import spark.RDD -import spark.streaming.StreamingContext +import spark.streaming.{Seconds, StreamingContext} import spark.streaming.StreamingContext._ -import spark.streaming.Seconds import scala.collection.mutable.SynchronizedQueue @@ -15,10 +14,9 @@ object QueueStream { System.exit(1) } - // Create the context and set the batch size - val ssc = new StreamingContext(args(0), "QueueStream") - ssc.setBatchDuration(Seconds(1)) - + // Create the context + val ssc = new StreamingContext(args(0), "QueueStream", Seconds(1)) + // Create the queue through which RDDs can be pushed to // a QueueInputDStream val rddQueue = new SynchronizedQueue[RDD[Int]]() diff --git a/streaming/src/main/scala/spark/streaming/examples/TopKWordCountRaw.scala b/streaming/src/main/scala/spark/streaming/examples/TopKWordCountRaw.scala index 0411bde1a7..fe4c2bf155 100644 --- a/streaming/src/main/scala/spark/streaming/examples/TopKWordCountRaw.scala +++ b/streaming/src/main/scala/spark/streaming/examples/TopKWordCountRaw.scala @@ -20,12 +20,11 
@@ object TopKWordCountRaw { val Array(master, IntParam(numStreams), IntParam(port), checkpointDir) = args val k = 10 - // Create the context, set the batch size and checkpoint directory. + // Create the context, and set the checkpoint directory. // Checkpoint directory is necessary for achieving fault-tolerance, by saving counts // periodically to HDFS - val ssc = new StreamingContext(master, "TopKWordCountRaw") - ssc.setBatchDuration(Seconds(1)) - ssc.checkpoint(checkpointDir + "/" + UUID.randomUUID.toString, Seconds(1)) + val ssc = new StreamingContext(master, "TopKWordCountRaw", Seconds(1)) + ssc.checkpoint(checkpointDir + "/" + UUID.randomUUID.toString, Seconds(1)) // Warm up the JVMs on master and slave for JIT compilation to kick in /*warmUp(ssc.sc)*/ diff --git a/streaming/src/main/scala/spark/streaming/examples/WordCountHdfs.scala b/streaming/src/main/scala/spark/streaming/examples/WordCountHdfs.scala index 591cb141c3..867a8f42c4 100644 --- a/streaming/src/main/scala/spark/streaming/examples/WordCountHdfs.scala +++ b/streaming/src/main/scala/spark/streaming/examples/WordCountHdfs.scala @@ -10,9 +10,8 @@ object WordCountHdfs { System.exit(1) } - // Create the context and set the batch size - val ssc = new StreamingContext(args(0), "WordCountHdfs") - ssc.setBatchDuration(Seconds(2)) + // Create the context + val ssc = new StreamingContext(args(0), "WordCountHdfs", Seconds(2)) // Create the FileInputDStream on the directory and use the // stream to count words in new files created diff --git a/streaming/src/main/scala/spark/streaming/examples/WordCountNetwork.scala b/streaming/src/main/scala/spark/streaming/examples/WordCountNetwork.scala index ba1bd1de7c..eadda60563 100644 --- a/streaming/src/main/scala/spark/streaming/examples/WordCountNetwork.scala +++ b/streaming/src/main/scala/spark/streaming/examples/WordCountNetwork.scala @@ -6,13 +6,13 @@ import spark.streaming.StreamingContext._ object WordCountNetwork { def main(args: Array[String]) { if (args.length < 2) { - System.err.println("Usage: WordCountNetwork ") + System.err.println("Usage: WordCountNetwork \n" + + "In local mode, should be 'local[n]' with n > 1") System.exit(1) } // Create the context and set the batch size - val ssc = new StreamingContext(args(0), "WordCountNetwork") - ssc.setBatchDuration(Seconds(2)) + val ssc = new StreamingContext(args(0), "WordCountNetwork", Seconds(1)) // Create a NetworkInputDStream on target ip:port and count the // words in input stream of \n delimited test (eg. generated by 'nc') diff --git a/streaming/src/main/scala/spark/streaming/examples/WordCountRaw.scala b/streaming/src/main/scala/spark/streaming/examples/WordCountRaw.scala index 571428c0fe..a29c81d437 100644 --- a/streaming/src/main/scala/spark/streaming/examples/WordCountRaw.scala +++ b/streaming/src/main/scala/spark/streaming/examples/WordCountRaw.scala @@ -19,12 +19,11 @@ object WordCountRaw { val Array(master, IntParam(numStreams), IntParam(port), checkpointDir) = args - // Create the context, set the batch size and checkpoint directory. + // Create the context, and set the checkpoint directory. 
// Checkpoint directory is necessary for achieving fault-tolerance, by saving counts // periodically to HDFS - val ssc = new StreamingContext(master, "WordCountRaw") - ssc.setBatchDuration(Seconds(1)) - ssc.checkpoint(checkpointDir + "/" + UUID.randomUUID.toString, Seconds(1)) + val ssc = new StreamingContext(master, "WordCountRaw", Seconds(1)) + ssc.checkpoint(checkpointDir + "/" + UUID.randomUUID.toString, Seconds(1)) // Warm up the JVMs on master and slave for JIT compilation to kick in warmUp(ssc.sc) diff --git a/streaming/src/main/scala/spark/streaming/examples/clickstream/PageViewStream.scala b/streaming/src/main/scala/spark/streaming/examples/clickstream/PageViewStream.scala index 1a51fb66cd..68be6b7893 100644 --- a/streaming/src/main/scala/spark/streaming/examples/clickstream/PageViewStream.scala +++ b/streaming/src/main/scala/spark/streaming/examples/clickstream/PageViewStream.scala @@ -23,9 +23,8 @@ object PageViewStream { val host = args(1) val port = args(2).toInt - // Create the context and set the batch size - val ssc = new StreamingContext("local[2]", "PageViewStream") - ssc.setBatchDuration(Seconds(1)) + // Create the context + val ssc = new StreamingContext("local[2]", "PageViewStream", Seconds(1)) // Create a NetworkInputDStream on target host:port and convert each line to a PageView val pageViews = ssc.networkTextStream(host, port) diff --git a/streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala index d0aaac0f2e..dc38ef4912 100644 --- a/streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala +++ b/streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala @@ -175,7 +175,7 @@ class BasicOperationsSuite extends TestSuiteBase { } val ssc = setupStreams(input, operation _) - ssc.setRememberDuration(rememberDuration) + ssc.remember(rememberDuration) runStreams[(Int, Int)](ssc, input.size, input.size / 2) val windowedStream2 = ssc.graph.getOutputStreams().head.dependencies.head diff --git a/streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala index 3e99440226..e98c096725 100644 --- a/streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala +++ b/streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala @@ -40,8 +40,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { testServer.start() // Set up the streaming context and input streams - val ssc = new StreamingContext(master, framework) - ssc.setBatchDuration(batchDuration) + val ssc = new StreamingContext(master, framework, batchDuration) val networkStream = ssc.networkTextStream("localhost", testPort, StorageLevel.MEMORY_AND_DISK) val outputBuffer = new ArrayBuffer[Seq[String]] with SynchronizedBuffer[Seq[String ]] val outputStream = new TestOutputStream(networkStream, outputBuffer) @@ -89,8 +88,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { testServer.start() // Set up the streaming context and input streams - var ssc = new StreamingContext(master, framework) - ssc.setBatchDuration(batchDuration) + var ssc = new StreamingContext(master, framework, batchDuration) ssc.checkpoint(checkpointDir, checkpointInterval) val networkStream = ssc.networkTextStream("localhost", testPort, StorageLevel.MEMORY_AND_DISK) var outputStream = new TestOutputStream(networkStream, new ArrayBuffer[Seq[String]]) @@ -137,8 +135,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } // Set 
up the streaming context and input streams - val ssc = new StreamingContext(master, framework) - ssc.setBatchDuration(batchDuration) + val ssc = new StreamingContext(master, framework, batchDuration) val filestream = ssc.textFileStream(testDir.toString) val outputBuffer = new ArrayBuffer[Seq[String]] with SynchronizedBuffer[Seq[String]] def output = outputBuffer.flatMap(x => x) @@ -198,8 +195,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } // Set up the streaming context and input streams - var ssc = new StreamingContext(master, framework) - ssc.setBatchDuration(batchDuration) + var ssc = new StreamingContext(master, framework, batchDuration) ssc.checkpoint(checkpointDir, checkpointInterval) val filestream = ssc.textFileStream(testDir.toString) var outputStream = new TestOutputStream(filestream, new ArrayBuffer[Seq[String]]) diff --git a/streaming/src/test/scala/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/spark/streaming/TestSuiteBase.scala index 5fb5cc504c..8cc2f8ccfc 100644 --- a/streaming/src/test/scala/spark/streaming/TestSuiteBase.scala +++ b/streaming/src/test/scala/spark/streaming/TestSuiteBase.scala @@ -76,8 +76,7 @@ trait TestSuiteBase extends FunSuite with Logging { ): StreamingContext = { // Create StreamingContext - val ssc = new StreamingContext(master, framework) - ssc.setBatchDuration(batchDuration) + val ssc = new StreamingContext(master, framework, batchDuration) if (checkpointDir != null) { ssc.checkpoint(checkpointDir, checkpointInterval) } @@ -98,8 +97,7 @@ trait TestSuiteBase extends FunSuite with Logging { ): StreamingContext = { // Create StreamingContext - val ssc = new StreamingContext(master, framework) - ssc.setBatchDuration(batchDuration) + val ssc = new StreamingContext(master, framework, batchDuration) if (checkpointDir != null) { ssc.checkpoint(checkpointDir, checkpointInterval) } -- cgit v1.2.3 From b18d70870a33a4783c6b3b787bef9b0eec30bce0 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 27 Nov 2012 15:08:49 -0800 Subject: Modified bunch HashMaps in Spark to use TimeStampedHashMap and made various modules use CleanupTask to periodically clean up metadata. 
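For illustration only (this sketch is not part of the patch, and ExampleTracker is a hypothetical class), the pattern adopted below by CacheTracker, MapOutputTracker, DAGScheduler and ShuffleMapTask is: keep the metadata in a TimeStampedHashMap, which records an insertion timestamp per entry, and hand its cleanup method to a CleanupTask, which periodically drops entries older than the configured spark.cleanup.delay (in minutes).

    import spark.util.{CleanupTask, TimeStampedHashMap}

    class ExampleTracker {
      // Each entry carries its insertion time alongside the value.
      private val metadata = new TimeStampedHashMap[Int, String]

      // Periodically removes entries older than spark.cleanup.delay minutes.
      private val cleanupTask = new CleanupTask("ExampleTracker", metadata.cleanup)

      def register(id: Int, info: String) { metadata(id) = info }

      def stop() {
        metadata.clear()
        cleanupTask.cancel()
      }
    }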
--- core/src/main/scala/spark/CacheTracker.scala | 6 +- core/src/main/scala/spark/MapOutputTracker.scala | 27 ++++--- .../main/scala/spark/scheduler/DAGScheduler.scala | 13 +++- .../scala/spark/scheduler/ShuffleMapTask.scala | 6 +- core/src/main/scala/spark/util/CleanupTask.scala | 31 ++++++++ .../main/scala/spark/util/TimeStampedHashMap.scala | 87 ++++++++++++++++++++++ .../scala/spark/streaming/StreamingContext.scala | 13 +++- 7 files changed, 165 insertions(+), 18 deletions(-) create mode 100644 core/src/main/scala/spark/util/CleanupTask.scala create mode 100644 core/src/main/scala/spark/util/TimeStampedHashMap.scala (limited to 'streaming') diff --git a/core/src/main/scala/spark/CacheTracker.scala b/core/src/main/scala/spark/CacheTracker.scala index c5db6ce63a..0ee59bee0f 100644 --- a/core/src/main/scala/spark/CacheTracker.scala +++ b/core/src/main/scala/spark/CacheTracker.scala @@ -14,6 +14,7 @@ import scala.collection.mutable.HashSet import spark.storage.BlockManager import spark.storage.StorageLevel +import util.{CleanupTask, TimeStampedHashMap} private[spark] sealed trait CacheTrackerMessage @@ -30,7 +31,7 @@ private[spark] case object StopCacheTracker extends CacheTrackerMessage private[spark] class CacheTrackerActor extends Actor with Logging { // TODO: Should probably store (String, CacheType) tuples - private val locs = new HashMap[Int, Array[List[String]]] + private val locs = new TimeStampedHashMap[Int, Array[List[String]]] /** * A map from the slave's host name to its cache size. @@ -38,6 +39,8 @@ private[spark] class CacheTrackerActor extends Actor with Logging { private val slaveCapacity = new HashMap[String, Long] private val slaveUsage = new HashMap[String, Long] + private val cleanupTask = new CleanupTask("CacheTracker", locs.cleanup) + private def getCacheUsage(host: String): Long = slaveUsage.getOrElse(host, 0L) private def getCacheCapacity(host: String): Long = slaveCapacity.getOrElse(host, 0L) private def getCacheAvailable(host: String): Long = getCacheCapacity(host) - getCacheUsage(host) @@ -86,6 +89,7 @@ private[spark] class CacheTrackerActor extends Actor with Logging { case StopCacheTracker => logInfo("Stopping CacheTrackerActor") sender ! true + cleanupTask.cancel() context.stop(self) } } diff --git a/core/src/main/scala/spark/MapOutputTracker.scala b/core/src/main/scala/spark/MapOutputTracker.scala index 45441aa5e5..d0be1bb913 100644 --- a/core/src/main/scala/spark/MapOutputTracker.scala +++ b/core/src/main/scala/spark/MapOutputTracker.scala @@ -17,6 +17,7 @@ import scala.collection.mutable.HashSet import scheduler.MapStatus import spark.storage.BlockManagerId import java.util.zip.{GZIPInputStream, GZIPOutputStream} +import util.{CleanupTask, TimeStampedHashMap} private[spark] sealed trait MapOutputTrackerMessage private[spark] case class GetMapOutputStatuses(shuffleId: Int, requester: String) @@ -43,7 +44,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea val timeout = 10.seconds - var mapStatuses = new ConcurrentHashMap[Int, Array[MapStatus]] + var mapStatuses = new TimeStampedHashMap[Int, Array[MapStatus]] // Incremented every time a fetch fails so that client nodes know to clear // their cache of map output locations if this happens. 
@@ -52,7 +53,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea // Cache a serialized version of the output statuses for each shuffle to send them out faster var cacheGeneration = generation - val cachedSerializedStatuses = new HashMap[Int, Array[Byte]] + val cachedSerializedStatuses = new TimeStampedHashMap[Int, Array[Byte]] var trackerActor: ActorRef = if (isMaster) { val actor = actorSystem.actorOf(Props(new MapOutputTrackerActor(this)), name = actorName) @@ -63,6 +64,8 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea actorSystem.actorFor(url) } + val cleanupTask = new CleanupTask("MapOutputTracker", this.cleanup) + // Send a message to the trackerActor and get its result within a default timeout, or // throw a SparkException if this fails. def askTracker(message: Any): Any = { @@ -83,14 +86,14 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea } def registerShuffle(shuffleId: Int, numMaps: Int) { - if (mapStatuses.get(shuffleId) != null) { + if (mapStatuses.get(shuffleId) != None) { throw new IllegalArgumentException("Shuffle ID " + shuffleId + " registered twice") } mapStatuses.put(shuffleId, new Array[MapStatus](numMaps)) } def registerMapOutput(shuffleId: Int, mapId: Int, status: MapStatus) { - var array = mapStatuses.get(shuffleId) + var array = mapStatuses(shuffleId) array.synchronized { array(mapId) = status } @@ -107,7 +110,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea } def unregisterMapOutput(shuffleId: Int, mapId: Int, bmAddress: BlockManagerId) { - var array = mapStatuses.get(shuffleId) + var array = mapStatuses(shuffleId) if (array != null) { array.synchronized { if (array(mapId).address == bmAddress) { @@ -125,7 +128,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea // Called on possibly remote nodes to get the server URIs and output sizes for a given shuffle def getServerStatuses(shuffleId: Int, reduceId: Int): Array[(BlockManagerId, Long)] = { - val statuses = mapStatuses.get(shuffleId) + val statuses = mapStatuses.get(shuffleId).orNull if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") fetching.synchronized { @@ -138,7 +141,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea case e: InterruptedException => } } - return mapStatuses.get(shuffleId).map(status => + return mapStatuses(shuffleId).map(status => (status.address, MapOutputTracker.decompressSize(status.compressedSizes(reduceId)))) } else { fetching += shuffleId @@ -164,9 +167,15 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea } } + def cleanup(cleanupTime: Long) { + mapStatuses.cleanup(cleanupTime) + cachedSerializedStatuses.cleanup(cleanupTime) + } + def stop() { communicate(StopMapOutputTracker) mapStatuses.clear() + cleanupTask.cancel() trackerActor = null } @@ -192,7 +201,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea generationLock.synchronized { if (newGen > generation) { logInfo("Updating generation to " + newGen + " and clearing cache") - mapStatuses = new ConcurrentHashMap[Int, Array[MapStatus]] + mapStatuses = new TimeStampedHashMap[Int, Array[MapStatus]] generation = newGen } } @@ -210,7 +219,7 @@ private[spark] class MapOutputTracker(actorSystem: ActorSystem, isMaster: Boolea case Some(bytes) => return bytes case None => - statuses = mapStatuses.get(shuffleId) + 
statuses = mapStatuses(shuffleId) generationGotten = generation } } diff --git a/core/src/main/scala/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/spark/scheduler/DAGScheduler.scala index aaaed59c4a..3af877b817 100644 --- a/core/src/main/scala/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/spark/scheduler/DAGScheduler.scala @@ -14,6 +14,7 @@ import spark.partial.ApproximateEvaluator import spark.partial.PartialResult import spark.storage.BlockManagerMaster import spark.storage.BlockManagerId +import util.{CleanupTask, TimeStampedHashMap} /** * A Scheduler subclass that implements stage-oriented scheduling. It computes a DAG of stages for @@ -61,9 +62,9 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with val nextStageId = new AtomicInteger(0) - val idToStage = new HashMap[Int, Stage] + val idToStage = new TimeStampedHashMap[Int, Stage] - val shuffleToMapStage = new HashMap[Int, Stage] + val shuffleToMapStage = new TimeStampedHashMap[Int, Stage] var cacheLocs = new HashMap[Int, Array[List[String]]] @@ -83,6 +84,8 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with val activeJobs = new HashSet[ActiveJob] val resultStageToJob = new HashMap[Stage, ActiveJob] + val cleanupTask = new CleanupTask("DAGScheduler", this.cleanup) + // Start a thread to run the DAGScheduler event loop new Thread("DAGScheduler") { setDaemon(true) @@ -591,8 +594,14 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with return Nil } + def cleanup(cleanupTime: Long) { + idToStage.cleanup(cleanupTime) + shuffleToMapStage.cleanup(cleanupTime) + } + def stop() { eventQueue.put(StopDAGScheduler) + cleanupTask.cancel() taskSched.stop() } } diff --git a/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala index 60105c42b6..fbf618c906 100644 --- a/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala +++ b/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala @@ -14,17 +14,19 @@ import com.ning.compress.lzf.LZFOutputStream import spark._ import spark.storage._ +import util.{TimeStampedHashMap, CleanupTask} private[spark] object ShuffleMapTask { // A simple map between the stage id to the serialized byte array of a task. // Served as a cache for task serialization because serialization can be // expensive on the master node if it needs to launch thousands of tasks. 
- val serializedInfoCache = new JHashMap[Int, Array[Byte]] + val serializedInfoCache = new TimeStampedHashMap[Int, Array[Byte]] + val cleanupTask = new CleanupTask("ShuffleMapTask", serializedInfoCache.cleanup) def serializeInfo(stageId: Int, rdd: RDD[_], dep: ShuffleDependency[_,_]): Array[Byte] = { synchronized { - val old = serializedInfoCache.get(stageId) + val old = serializedInfoCache.get(stageId).orNull if (old != null) { return old } else { diff --git a/core/src/main/scala/spark/util/CleanupTask.scala b/core/src/main/scala/spark/util/CleanupTask.scala new file mode 100644 index 0000000000..ccc28803e0 --- /dev/null +++ b/core/src/main/scala/spark/util/CleanupTask.scala @@ -0,0 +1,31 @@ +package spark.util + +import java.util.concurrent.{TimeUnit, ScheduledFuture, Executors} +import java.util.{TimerTask, Timer} +import spark.Logging + +class CleanupTask(name: String, cleanupFunc: (Long) => Unit) extends Logging { + val delayMins = System.getProperty("spark.cleanup.delay", "-100").toInt + val periodMins = System.getProperty("spark.cleanup.period", (delayMins / 10).toString).toInt + val timer = new Timer(name + " cleanup timer", true) + val task = new TimerTask { + def run() { + try { + if (delayMins > 0) { + + cleanupFunc(System.currentTimeMillis() - (delayMins * 60 * 1000)) + logInfo("Ran cleanup task for " + name) + } + } catch { + case e: Exception => logError("Error running cleanup task for " + name, e) + } + } + } + if (periodMins > 0) { + timer.schedule(task, periodMins * 60 * 1000, periodMins * 60 * 1000) + } + + def cancel() { + timer.cancel() + } +} diff --git a/core/src/main/scala/spark/util/TimeStampedHashMap.scala b/core/src/main/scala/spark/util/TimeStampedHashMap.scala new file mode 100644 index 0000000000..7a22b80a20 --- /dev/null +++ b/core/src/main/scala/spark/util/TimeStampedHashMap.scala @@ -0,0 +1,87 @@ +package spark.util + +import scala.collection.JavaConversions._ +import scala.collection.mutable.{HashMap, Map} +import java.util.concurrent.ConcurrentHashMap + +/** + * This is a custom implementation of scala.collection.mutable.Map which stores the insertion + * time stamp along with each key-value pair. Key-value pairs that are older than a particular + * threshold time can them be removed using the cleanup method. This is intended to be a drop-in + * replacement of scala.collection.mutable.HashMap. 
+ */ +class TimeStampedHashMap[A, B] extends Map[A, B]() { + val internalMap = new ConcurrentHashMap[A, (B, Long)]() + + def get(key: A): Option[B] = { + val value = internalMap.get(key) + if (value != null) Some(value._1) else None + } + + def iterator: Iterator[(A, B)] = { + val jIterator = internalMap.entrySet().iterator() + jIterator.map(kv => (kv.getKey, kv.getValue._1)) + } + + override def + [B1 >: B](kv: (A, B1)): Map[A, B1] = { + val newMap = new TimeStampedHashMap[A, B1] + newMap.internalMap.putAll(this.internalMap) + newMap.internalMap.put(kv._1, (kv._2, currentTime)) + newMap + } + + override def - (key: A): Map[A, B] = { + internalMap.remove(key) + this + } + + override def += (kv: (A, B)): this.type = { + internalMap.put(kv._1, (kv._2, currentTime)) + this + } + + override def -= (key: A): this.type = { + internalMap.remove(key) + this + } + + override def update(key: A, value: B) { + this += ((key, value)) + } + + override def apply(key: A): B = { + val value = internalMap.get(key) + if (value == null) throw new NoSuchElementException() + value._1 + } + + override def filter(p: ((A, B)) => Boolean): Map[A, B] = { + internalMap.map(kv => (kv._1, kv._2._1)).filter(p) + } + + override def empty: Map[A, B] = new TimeStampedHashMap[A, B]() + + override def size(): Int = internalMap.size() + + override def foreach[U](f: ((A, B)) => U): Unit = { + val iterator = internalMap.entrySet().iterator() + while(iterator.hasNext) { + val entry = iterator.next() + val kv = (entry.getKey, entry.getValue._1) + f(kv) + } + } + + def cleanup(threshTime: Long) { + val iterator = internalMap.entrySet().iterator() + while(iterator.hasNext) { + val entry = iterator.next() + if (entry.getValue._2 < threshTime) { + iterator.remove() + } + } + } + + private def currentTime: Long = System.currentTimeMillis() + +} diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala index 4a41f2f516..58123dc82c 100644 --- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala @@ -43,7 +43,7 @@ class StreamingContext private ( * @param batchDuration The time interval at which streaming data will be divided into batches */ def this(master: String, frameworkName: String, batchDuration: Time) = - this(new SparkContext(master, frameworkName), null, batchDuration) + this(StreamingContext.createNewSparkContext(master, frameworkName), null, batchDuration) /** * Recreates the StreamingContext from a checkpoint file. @@ -214,11 +214,8 @@ class StreamingContext private ( "Checkpoint directory has been set, but the graph checkpointing interval has " + "not been set. Please use StreamingContext.checkpoint() to set the interval." ) - - } - /** * This function starts the execution of the streams. 
*/ @@ -265,6 +262,14 @@ class StreamingContext private ( object StreamingContext { + + def createNewSparkContext(master: String, frameworkName: String): SparkContext = { + if (System.getProperty("spark.cleanup.delay", "-1").toInt < 0) { + System.setProperty("spark.cleanup.delay", "60") + } + new SparkContext(master, frameworkName) + } + implicit def toPairDStreamFunctions[K: ClassManifest, V: ClassManifest](stream: DStream[(K,V)]) = { new PairDStreamFunctions[K, V](stream) } -- cgit v1.2.3 From d5e7aad039603a8a02d11f9ebda001422ca4c341 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 28 Nov 2012 08:36:55 +0000 Subject: Bug fixes --- core/src/main/scala/spark/scheduler/DAGScheduler.scala | 11 ++++++++++- core/src/main/scala/spark/util/CleanupTask.scala | 17 +++++++++-------- .../main/scala/spark/streaming/StreamingContext.scala | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'streaming') diff --git a/core/src/main/scala/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/spark/scheduler/DAGScheduler.scala index 3af877b817..affacb43ca 100644 --- a/core/src/main/scala/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/spark/scheduler/DAGScheduler.scala @@ -78,7 +78,7 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with val waiting = new HashSet[Stage] // Stages we need to run whose parents aren't done val running = new HashSet[Stage] // Stages we are running right now val failed = new HashSet[Stage] // Stages that must be resubmitted due to fetch failures - val pendingTasks = new HashMap[Stage, HashSet[Task[_]]] // Missing tasks from each stage + val pendingTasks = new TimeStampedHashMap[Stage, HashSet[Task[_]]] // Missing tasks from each stage var lastFetchFailureTime: Long = 0 // Used to wait a bit to avoid repeated resubmits val activeJobs = new HashSet[ActiveJob] @@ -595,8 +595,17 @@ class DAGScheduler(taskSched: TaskScheduler) extends TaskSchedulerListener with } def cleanup(cleanupTime: Long) { + var sizeBefore = idToStage.size idToStage.cleanup(cleanupTime) + logInfo("idToStage " + sizeBefore + " --> " + idToStage.size) + + sizeBefore = shuffleToMapStage.size shuffleToMapStage.cleanup(cleanupTime) + logInfo("shuffleToMapStage " + sizeBefore + " --> " + shuffleToMapStage.size) + + sizeBefore = pendingTasks.size + pendingTasks.cleanup(cleanupTime) + logInfo("pendingTasks " + sizeBefore + " --> " + pendingTasks.size) } def stop() { diff --git a/core/src/main/scala/spark/util/CleanupTask.scala b/core/src/main/scala/spark/util/CleanupTask.scala index ccc28803e0..a4357c62c6 100644 --- a/core/src/main/scala/spark/util/CleanupTask.scala +++ b/core/src/main/scala/spark/util/CleanupTask.scala @@ -5,24 +5,25 @@ import java.util.{TimerTask, Timer} import spark.Logging class CleanupTask(name: String, cleanupFunc: (Long) => Unit) extends Logging { - val delayMins = System.getProperty("spark.cleanup.delay", "-100").toInt - val periodMins = System.getProperty("spark.cleanup.period", (delayMins / 10).toString).toInt + val delaySeconds = (System.getProperty("spark.cleanup.delay", "-100").toDouble * 60).toInt + val periodSeconds = math.max(10, delaySeconds / 10) val timer = new Timer(name + " cleanup timer", true) val task = new TimerTask { def run() { try { - if (delayMins > 0) { - - cleanupFunc(System.currentTimeMillis() - (delayMins * 60 * 1000)) + if (delaySeconds > 0) { + cleanupFunc(System.currentTimeMillis() - (delaySeconds * 1000)) logInfo("Ran cleanup task for " + name) - } + } } catch { case e: Exception => logError("Error 
running cleanup task for " + name, e) } } } - if (periodMins > 0) { - timer.schedule(task, periodMins * 60 * 1000, periodMins * 60 * 1000) + if (periodSeconds > 0) { + logInfo("Starting cleanup task for " + name + " with delay of " + delaySeconds + " seconds and " + + "period of " + periodSeconds + " secs") + timer.schedule(task, periodSeconds * 1000, periodSeconds * 1000) } def cancel() { diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala index 58123dc82c..90dd560752 100644 --- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala @@ -264,7 +264,7 @@ class StreamingContext private ( object StreamingContext { def createNewSparkContext(master: String, frameworkName: String): SparkContext = { - if (System.getProperty("spark.cleanup.delay", "-1").toInt < 0) { + if (System.getProperty("spark.cleanup.delay", "-1").toDouble < 0) { System.setProperty("spark.cleanup.delay", "60") } new SparkContext(master, frameworkName) -- cgit v1.2.3 From 62965c5d8e3f4f0246ac2c8814ac75ea82b3f238 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Sat, 1 Dec 2012 08:26:10 -0800 Subject: Added ssc.union --- streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala | 3 ++- streaming/src/main/scala/spark/streaming/StreamingContext.scala | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'streaming') diff --git a/streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala b/streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala index 8b484e6acf..bb852cbcca 100644 --- a/streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala +++ b/streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala @@ -118,7 +118,8 @@ class ReducedWindowedDStream[K: ClassManifest, V: ClassManifest]( if (seqOfValues(0).isEmpty) { // If previous window's reduce value does not exist, then at least new values should exist if (newValues.isEmpty) { - throw new Exception("Neither previous window has value for key, nor new values found") + val info = "seqOfValues =\n" + seqOfValues.map(x => "[" + x.mkString(",") + "]").mkString("\n") + throw new Exception("Neither previous window has value for key, nor new values found\n" + info) } // Reduce the new values newValues.reduce(reduceF) // return diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala index 90dd560752..63d8766749 100644 --- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala @@ -189,6 +189,10 @@ class StreamingContext private ( inputStream } + def union[T: ClassManifest](streams: Seq[DStream[T]]): DStream[T] = { + new UnionDStream[T](streams.toArray) + } + /** * This function registers a InputDStream as an input stream that will be * started (InputDStream.start() called) to get the input data streams. -- cgit v1.2.3 From 477de94894b7d8eeed281d33c12bcb2269d117c7 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Sat, 1 Dec 2012 13:15:06 -0800 Subject: Minor modifications. 
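Illustration only, not part of the patch: the cleaner delay is now read and written through MetadataCleaner instead of raw system properties. Note that the underlying spark.cleaner.delay property is expressed in minutes, so the example below (say, in the Spark shell) keeps metadata for one hour; the value 60 is only an example.

    import spark.util.MetadataCleaner

    // Writes spark.cleaner.delay = 60 (minutes).
    MetadataCleaner.setDelaySeconds(60)

    // Reads the property back converted to seconds: 60 * 60 = 3600.
    val delaySeconds = MetadataCleaner.getDelaySeconds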
--- core/src/main/scala/spark/util/MetadataCleaner.scala | 7 ++++++- streaming/src/main/scala/spark/streaming/DStream.scala | 15 ++++++++++++++- .../scala/spark/streaming/ReducedWindowedDStream.scala | 4 ++-- .../src/main/scala/spark/streaming/StreamingContext.scala | 8 ++++++-- 4 files changed, 28 insertions(+), 6 deletions(-) (limited to 'streaming') diff --git a/core/src/main/scala/spark/util/MetadataCleaner.scala b/core/src/main/scala/spark/util/MetadataCleaner.scala index 71ac39864e..2541b26255 100644 --- a/core/src/main/scala/spark/util/MetadataCleaner.scala +++ b/core/src/main/scala/spark/util/MetadataCleaner.scala @@ -5,7 +5,7 @@ import java.util.{TimerTask, Timer} import spark.Logging class MetadataCleaner(name: String, cleanupFunc: (Long) => Unit) extends Logging { - val delaySeconds = (System.getProperty("spark.cleanup.delay", "-100").toDouble * 60).toInt + val delaySeconds = MetadataCleaner.getDelaySeconds val periodSeconds = math.max(10, delaySeconds / 10) val timer = new Timer(name + " cleanup timer", true) val task = new TimerTask { @@ -30,3 +30,8 @@ class MetadataCleaner(name: String, cleanupFunc: (Long) => Unit) extends Logging timer.cancel() } } + +object MetadataCleaner { + def getDelaySeconds = (System.getProperty("spark.cleaner.delay", "-100").toDouble * 60).toInt + def setDelaySeconds(delay: Long) { System.setProperty("spark.cleaner.delay", delay.toString) } +} diff --git a/streaming/src/main/scala/spark/streaming/DStream.scala b/streaming/src/main/scala/spark/streaming/DStream.scala index 8efda2074d..28a3e2dfc7 100644 --- a/streaming/src/main/scala/spark/streaming/DStream.scala +++ b/streaming/src/main/scala/spark/streaming/DStream.scala @@ -146,6 +146,8 @@ extends Serializable with Logging { } protected[streaming] def validate() { + assert(rememberDuration != null, "Remember duration is set to null") + assert( !mustCheckpoint || checkpointInterval != null, "The checkpoint interval for " + this.getClass.getSimpleName + " has not been set. " + @@ -180,13 +182,24 @@ extends Serializable with Logging { checkpointInterval + "). Please set it to higher than " + checkpointInterval + "." ) + val metadataCleanupDelay = System.getProperty("spark.cleanup.delay", "-1").toDouble + assert( + metadataCleanupDelay < 0 || rememberDuration < metadataCleanupDelay * 60 * 1000, + "It seems you are doing some DStream window operation or setting a checkpoint interval " + + "which requires " + this.getClass.getSimpleName + " to remember generated RDDs for more " + + "than " + rememberDuration.milliseconds + " milliseconds. But the Spark's metadata cleanup" + + "delay is set to " + metadataCleanupDelay + " minutes, which is not sufficient. Please set " + + "the Java property 'spark.cleanup.delay' to more than " + + math.ceil(rememberDuration.millis.toDouble / 60000.0).toInt + " minutes." 
+ ) + dependencies.foreach(_.validate()) logInfo("Slide time = " + slideTime) logInfo("Storage level = " + storageLevel) logInfo("Checkpoint interval = " + checkpointInterval) logInfo("Remember duration = " + rememberDuration) - logInfo("Initialized " + this) + logInfo("Initialized and validated " + this) } protected[streaming] def setContext(s: StreamingContext) { diff --git a/streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala b/streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala index bb852cbcca..f63a9e0011 100644 --- a/streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala +++ b/streaming/src/main/scala/spark/streaming/ReducedWindowedDStream.scala @@ -118,8 +118,8 @@ class ReducedWindowedDStream[K: ClassManifest, V: ClassManifest]( if (seqOfValues(0).isEmpty) { // If previous window's reduce value does not exist, then at least new values should exist if (newValues.isEmpty) { - val info = "seqOfValues =\n" + seqOfValues.map(x => "[" + x.mkString(",") + "]").mkString("\n") - throw new Exception("Neither previous window has value for key, nor new values found\n" + info) + throw new Exception("Neither previous window has value for key, nor new values found. " + + "Are you sure your key class hashes consistently?") } // Reduce the new values newValues.reduce(reduceF) // return diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala index 63d8766749..9c19f6588d 100644 --- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala @@ -17,6 +17,7 @@ import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat} import org.apache.hadoop.mapreduce.lib.input.TextInputFormat import org.apache.hadoop.fs.Path import java.util.UUID +import spark.util.MetadataCleaner /** * A StreamingContext is the main entry point for Spark Streaming functionality. Besides the basic @@ -268,8 +269,11 @@ class StreamingContext private ( object StreamingContext { def createNewSparkContext(master: String, frameworkName: String): SparkContext = { - if (System.getProperty("spark.cleanup.delay", "-1").toDouble < 0) { - System.setProperty("spark.cleanup.delay", "60") + + // Set the default cleaner delay to an hour if not already set. + // This should be sufficient for even 1 second interval. + if (MetadataCleaner.getDelaySeconds < 0) { + MetadataCleaner.setDelaySeconds(60) } new SparkContext(master, frameworkName) } -- cgit v1.2.3 From b4dba55f78b0dfda728cf69c9c17e4863010d28d Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Sun, 2 Dec 2012 02:03:05 +0000 Subject: Made RDD checkpoint not create a new thread. Fixed bug in detecting when spark.cleaner.delay is insufficient. 
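To make the second fix concrete (the numbers are illustrative): after the previous commit renamed the property to spark.cleaner.delay, DStream.validate() was still reading the stale spark.cleanup.delay property, typically got the default of -1, and silently skipped the sufficiency check. It now goes through MetadataCleaner.

    // With spark.cleaner.delay = 60 (minutes), the default set by StreamingContext:
    val delaySeconds = spark.util.MetadataCleaner.getDelaySeconds  // 60 * 60 = 3600
    // validate() now requires rememberDuration < 3600 * 1000 ms (one hour),
    // instead of comparing against a property that is no longer set.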
--- core/src/main/scala/spark/RDD.scala | 31 +++++++--------------- .../main/scala/spark/util/TimeStampedHashMap.scala | 3 ++- .../src/main/scala/spark/streaming/DStream.scala | 9 ++++--- 3 files changed, 17 insertions(+), 26 deletions(-) (limited to 'streaming') diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala index 8af6c9bd6a..fbfcfbd704 100644 --- a/core/src/main/scala/spark/RDD.scala +++ b/core/src/main/scala/spark/RDD.scala @@ -211,28 +211,17 @@ abstract class RDD[T: ClassManifest]( if (startCheckpoint) { val rdd = this - val env = SparkEnv.get - - // Spawn a new thread to do the checkpoint as it takes sometime to write the RDD to file - val th = new Thread() { - override def run() { - // Save the RDD to a file, create a new HadoopRDD from it, - // and change the dependencies from the original parents to the new RDD - SparkEnv.set(env) - rdd.checkpointFile = new Path(context.checkpointDir, "rdd-" + id).toString - rdd.saveAsObjectFile(checkpointFile) - rdd.synchronized { - rdd.checkpointRDD = context.objectFile[T](checkpointFile, rdd.splits.size) - rdd.checkpointRDDSplits = rdd.checkpointRDD.splits - rdd.changeDependencies(rdd.checkpointRDD) - rdd.shouldCheckpoint = false - rdd.isCheckpointInProgress = false - rdd.isCheckpointed = true - println("Done checkpointing RDD " + rdd.id + ", " + rdd) - } - } + rdd.checkpointFile = new Path(context.checkpointDir, "rdd-" + id).toString + rdd.saveAsObjectFile(checkpointFile) + rdd.synchronized { + rdd.checkpointRDD = context.objectFile[T](checkpointFile, rdd.splits.size) + rdd.checkpointRDDSplits = rdd.checkpointRDD.splits + rdd.changeDependencies(rdd.checkpointRDD) + rdd.shouldCheckpoint = false + rdd.isCheckpointInProgress = false + rdd.isCheckpointed = true + println("Done checkpointing RDD " + rdd.id + ", " + rdd + ", created RDD " + rdd.checkpointRDD.id + ", " + rdd.checkpointRDD) } - th.start() } else { // Recursively call doCheckpoint() to perform checkpointing on parent RDD if they are marked dependencies.foreach(_.rdd.doCheckpoint()) diff --git a/core/src/main/scala/spark/util/TimeStampedHashMap.scala b/core/src/main/scala/spark/util/TimeStampedHashMap.scala index 9bcc9245c0..52f03784db 100644 --- a/core/src/main/scala/spark/util/TimeStampedHashMap.scala +++ b/core/src/main/scala/spark/util/TimeStampedHashMap.scala @@ -10,7 +10,7 @@ import java.util.concurrent.ConcurrentHashMap * threshold time can them be removed using the cleanup method. This is intended to be a drop-in * replacement of scala.collection.mutable.HashMap. */ -class TimeStampedHashMap[A, B] extends Map[A, B]() { +class TimeStampedHashMap[A, B] extends Map[A, B]() with spark.Logging { val internalMap = new ConcurrentHashMap[A, (B, Long)]() def get(key: A): Option[B] = { @@ -79,6 +79,7 @@ class TimeStampedHashMap[A, B] extends Map[A, B]() { while(iterator.hasNext) { val entry = iterator.next() if (entry.getValue._2 < threshTime) { + logDebug("Removing key " + entry.getKey) iterator.remove() } } diff --git a/streaming/src/main/scala/spark/streaming/DStream.scala b/streaming/src/main/scala/spark/streaming/DStream.scala index 28a3e2dfc7..d2e9de110e 100644 --- a/streaming/src/main/scala/spark/streaming/DStream.scala +++ b/streaming/src/main/scala/spark/streaming/DStream.scala @@ -182,14 +182,15 @@ extends Serializable with Logging { checkpointInterval + "). Please set it to higher than " + checkpointInterval + "." 
) - val metadataCleanupDelay = System.getProperty("spark.cleanup.delay", "-1").toDouble + val metadataCleanerDelay = spark.util.MetadataCleaner.getDelaySeconds + logInfo("metadataCleanupDelay = " + metadataCleanerDelay) assert( - metadataCleanupDelay < 0 || rememberDuration < metadataCleanupDelay * 60 * 1000, + metadataCleanerDelay < 0 || rememberDuration < metadataCleanerDelay * 1000, "It seems you are doing some DStream window operation or setting a checkpoint interval " + "which requires " + this.getClass.getSimpleName + " to remember generated RDDs for more " + "than " + rememberDuration.milliseconds + " milliseconds. But the Spark's metadata cleanup" + - "delay is set to " + metadataCleanupDelay + " minutes, which is not sufficient. Please set " + - "the Java property 'spark.cleanup.delay' to more than " + + "delay is set to " + (metadataCleanerDelay / 60.0) + " minutes, which is not sufficient. Please set " + + "the Java property 'spark.cleaner.delay' to more than " + math.ceil(rememberDuration.millis.toDouble / 60000.0).toInt + " minutes." ) -- cgit v1.2.3 From 609e00d599d3f429a838f598b3f32c5fdbd7ec5e Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Sun, 2 Dec 2012 02:39:08 +0000 Subject: Minor mods --- streaming/src/main/scala/spark/streaming/NetworkInputTracker.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'streaming') diff --git a/streaming/src/main/scala/spark/streaming/NetworkInputTracker.scala b/streaming/src/main/scala/spark/streaming/NetworkInputTracker.scala index d0fef70f7e..ae6692290e 100644 --- a/streaming/src/main/scala/spark/streaming/NetworkInputTracker.scala +++ b/streaming/src/main/scala/spark/streaming/NetworkInputTracker.scala @@ -58,7 +58,7 @@ class NetworkInputTracker( throw new Exception("Register received for unexpected id " + streamId) } receiverInfo += ((streamId, receiverActor)) - logInfo("Registered receiver for network stream " + streamId) + logInfo("Registered receiver for network stream " + streamId + " from " + sender.path.address) sender ! true } case AddBlocks(streamId, blockIds) => { -- cgit v1.2.3
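For reference, a minimal driver sketch, not part of any patch above, showing how a streaming program looks against the API at the end of this series: the batch duration is passed to the StreamingContext constructor, and the naive reduceByKeyAndWindow now pre-reduces each batch before windowing. The object name, master URL, host and port, checkpoint path and partition count are illustrative placeholders, and the usual DStream convenience operations (flatMap, map) are assumed.

    import spark.HashPartitioner
    import spark.streaming.{Seconds, StreamingContext}
    import spark.streaming.StreamingContext._

    object WindowedWordCount {
      def main(args: Array[String]) {
        // Batch duration is now a constructor argument (setBatchDuration is gone).
        val ssc = new StreamingContext("local[2]", "WindowedWordCount", Seconds(1))
        ssc.checkpoint("/tmp/windowed-wordcount", Seconds(10))

        // Count words over a 30-second window that slides every second.
        val lines = ssc.networkTextStream("localhost", 9999)
        val counts = lines.flatMap(_.split(" "))
                          .map(word => (word, 1))
                          .reduceByKeyAndWindow((a: Int, b: Int) => a + b,
                                                Seconds(30), Seconds(1), new HashPartitioner(2))

        counts.foreachRDD(rdd => println("Counts in window: " + rdd.take(10).mkString(", ")))
        ssc.start()
      }
    }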