Merge pull request #455 from tdas/streaming

Merging latest master branch changes to the streaming branch
author: Tathagata Das <tathagata.das1565@gmail.com> 2013-02-07 15:05:20 -0800
committer: Tathagata Das <tathagata.das1565@gmail.com> 2013-02-07 15:05:20 -0800
commit: bcee3cb2db2f213efdc142d83419dfd9e35bc4fe (patch)
tree: 0aa12b24f0074f3c6522a8ab3c16bf8ac9097061 /streaming/src
parent: 12300758ccbe7d45c6149d56772133ae1bc5cb25 (diff)
parent: 4cc223b4785c9da39c4a35d2adb7339dfa8e47e6 (diff)
download: spark-bcee3cb2db2f213efdc142d83419dfd9e35bc4fe.tar.gz
spark-bcee3cb2db2f213efdc142d83419dfd9e35bc4fe.tar.bz2
spark-bcee3cb2db2f213efdc142d83419dfd9e35bc4fe.zip
18 files changed, 404 insertions, 268 deletions
diff --git a/streaming/src/main/scala/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/spark/streaming/Checkpoint.scala
index 2f3adb39c2..b9eb7f8ec4 100644
--- a/streaming/src/main/scala/spark/streaming/Checkpoint.scala
+++ b/streaming/src/main/scala/spark/streaming/Checkpoint.scala
@@ -17,7 +17,8 @@ class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time)
   val jars = ssc.sc.jars
   val graph = ssc.graph
   val checkpointDir = ssc.checkpointDir
-  val checkpointDuration: Duration = ssc.checkpointDuration
+  val checkpointDuration = ssc.checkpointDuration
+  val pendingTimes = ssc.scheduler.jobManager.getPendingTimes()
 
   def validate() {
     assert(master != null, "Checkpoint.master is null")
diff --git a/streaming/src/main/scala/spark/streaming/DStream.scala b/streaming/src/main/scala/spark/streaming/DStream.scala
index b11ef443dc..0eb6aad187 100644
--- a/streaming/src/main/scala/spark/streaming/DStream.scala
+++ b/streaming/src/main/scala/spark/streaming/DStream.scala
@@ -12,7 +12,7 @@ import scala.collection.mutable.HashMap
 
 import java.io.{ObjectInputStream, IOException, ObjectOutputStream}
 
-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.conf.Configuration
 
 /**
@@ -75,7 +75,7 @@ abstract class DStream[T: ClassManifest] (
   // Checkpoint details
   protected[streaming] val mustCheckpoint = false
   protected[streaming] var checkpointDuration: Duration = null
-  protected[streaming] var checkpointData = new DStreamCheckpointData(HashMap[Time, Any]())
+  protected[streaming] val checkpointData = new DStreamCheckpointData(this)
 
   // Reference to whole DStream graph
   protected[streaming] var graph: DStreamGraph = null
@@ -85,10 +85,10 @@ abstract class DStream[T: ClassManifest] (
   // Duration for which the DStream requires its parent DStream to remember each RDD created
   protected[streaming] def parentRememberDuration = rememberDuration
 
-  /** Returns the StreamingContext associated with this DStream */
-  def context() = ssc
+  /** Return the StreamingContext associated with this DStream */
+  def context = ssc
 
-  /** Persists the RDDs of this DStream with the given storage level */
+  /** Persist the RDDs of this DStream with the given storage level */
   def persist(level: StorageLevel): DStream[T] = {
     if (this.isInitialized) {
       throw new UnsupportedOperationException(
@@ -159,7 +159,7 @@ abstract class DStream[T: ClassManifest] (
     )
 
     assert(
-     checkpointDuration == null || ssc.sc.checkpointDir.isDefined,
+     checkpointDuration == null || context.sparkContext.checkpointDir.isDefined,
       "The checkpoint directory has not been set. Please use StreamingContext.checkpoint()" +
       " or SparkContext.checkpoint() to set the checkpoint directory."
     )
@@ -198,10 +198,10 @@ abstract class DStream[T: ClassManifest] (
       metadataCleanerDelay < 0 || rememberDuration.milliseconds < metadataCleanerDelay * 1000,
       "It seems you are doing some DStream window operation or setting a checkpoint interval " +
         "which requires " + this.getClass.getSimpleName + " to remember generated RDDs for more " +
-        "than " + rememberDuration.milliseconds + " milliseconds. But the Spark's metadata cleanup" +
-        "delay is set to " + (metadataCleanerDelay / 60.0) + " minutes, which is not sufficient. Please set " +
-        "the Java property 'spark.cleaner.delay' to more than " +
-        math.ceil(rememberDuration.milliseconds.toDouble / 60000.0).toInt + " minutes."
+        "than " + rememberDuration.milliseconds / 1000 + " seconds. But Spark's metadata cleanup" +
+        "delay is set to " + metadataCleanerDelay + " seconds, which is not sufficient. Please " +
+        "set the Java property 'spark.cleaner.delay' to more than " +
+        math.ceil(rememberDuration.milliseconds / 1000.0).toInt + " seconds."
     )
 
     dependencies.foreach(_.validate())
@@ -298,8 +298,8 @@ abstract class DStream[T: ClassManifest] (
     getOrCompute(time) match {
       case Some(rdd) => {
         val jobFunc = () => {
-          val emptyFunc = { (iterator: Iterator[T]) => {} } 
-          ssc.sc.runJob(rdd, emptyFunc)
+          val emptyFunc = { (iterator: Iterator[T]) => {} }
+          context.sparkContext.runJob(rdd, emptyFunc)
         }
         Some(new Job(time, jobFunc))
       }
@@ -310,10 +310,9 @@ abstract class DStream[T: ClassManifest] (
   /**
    * Dereference RDDs that are older than rememberDuration.
    */
-  protected[streaming] def forgetOldRDDs(time: Time) {
-    val keys = generatedRDDs.keys
+  protected[streaming] def forgetOldMetadata(time: Time) {
     var numForgotten = 0
-    keys.foreach(t => {
+    generatedRDDs.keys.foreach(t => {
       if (t <= (time - rememberDuration)) {
         generatedRDDs.remove(t)
         numForgotten += 1
@@ -321,7 +320,7 @@ abstract class DStream[T: ClassManifest] (
       }
     })
     logInfo("Forgot " + numForgotten + " RDDs from " + this)
-    dependencies.foreach(_.forgetOldRDDs(time))
+    dependencies.foreach(_.forgetOldMetadata(time))
   }
 
   /* Adds metadata to the Stream while it is running. 
@@ -342,40 +341,10 @@ abstract class DStream[T: ClassManifest] (
    */
   protected[streaming] def updateCheckpointData(currentTime: Time) {
     logInfo("Updating checkpoint data for time " + currentTime)
-
-    // Get the checkpointed RDDs from the generated RDDs
-    val newRdds = generatedRDDs.filter(_._2.getCheckpointFile.isDefined)
-                               .map(x => (x._1, x._2.getCheckpointFile.get))
-
-    // Make a copy of the existing checkpoint data (checkpointed RDDs)
-    val oldRdds = checkpointData.rdds.clone()
-
-    // If the new checkpoint data has checkpoints then replace existing with the new one
-    if (newRdds.size > 0) {
-      checkpointData.rdds.clear()
-      checkpointData.rdds ++= newRdds
-    }
-
-    // Make parent DStreams update their checkpoint data
+    checkpointData.update()
     dependencies.foreach(_.updateCheckpointData(currentTime))
-
-    // TODO: remove this, this is just for debugging
-    newRdds.foreach {
-      case (time, data) => { logInfo("Added checkpointed RDD for time " + time + " to stream checkpoint") }
-    }
-
-    if (newRdds.size > 0) {
-      (oldRdds -- newRdds.keySet).foreach {
-        case (time, data) => {
-          val path = new Path(data.toString)
-          val fs = path.getFileSystem(new Configuration())
-          fs.delete(path, true)
-          logInfo("Deleted checkpoint file '" + path + "' for time " + time)
-        }
-      }
-    }
-    logInfo("Updated checkpoint data for time " + currentTime + ", " + checkpointData.rdds.size + " checkpoints, " 
-      + "[" + checkpointData.rdds.mkString(",") + "]")
+    checkpointData.cleanup()
+    logDebug("Updated checkpoint data for time " + currentTime + ": " + checkpointData)
   }
 
   /**
@@ -386,14 +355,8 @@ abstract class DStream[T: ClassManifest] (
    */
   protected[streaming] def restoreCheckpointData() {
     // Create RDDs from the checkpoint data
-    logInfo("Restoring checkpoint data from " + checkpointData.rdds.size + " checkpointed RDDs")
-    checkpointData.rdds.foreach {
-      case(time, data) => {
-        logInfo("Restoring checkpointed RDD for time " + time + " from file '" + data.toString + "'")
-        val rdd = ssc.sc.checkpointFile[T](data.toString)
-        generatedRDDs += ((time, rdd))
-      }
-    }
+    logInfo("Restoring checkpoint data")
+    checkpointData.restore()
     dependencies.foreach(_.restoreCheckpointData())
     logInfo("Restored checkpoint data")
   }
@@ -433,7 +396,7 @@ abstract class DStream[T: ClassManifest] (
 
   /** Return a new DStream by applying a function to all elements of this DStream. */
   def map[U: ClassManifest](mapFunc: T => U): DStream[U] = {
-    new MappedDStream(this, ssc.sc.clean(mapFunc))
+    new MappedDStream(this, context.sparkContext.clean(mapFunc))
   }
 
   /**
@@ -441,7 +404,7 @@ abstract class DStream[T: ClassManifest] (
    * and then flattening the results
    */
   def flatMap[U: ClassManifest](flatMapFunc: T => Traversable[U]): DStream[U] = {
-    new FlatMappedDStream(this, ssc.sc.clean(flatMapFunc))
+    new FlatMappedDStream(this, context.sparkContext.clean(flatMapFunc))
   }
 
   /** Return a new DStream containing only the elements that satisfy a predicate. */
@@ -463,7 +426,7 @@ abstract class DStream[T: ClassManifest] (
       mapPartFunc: Iterator[T] => Iterator[U],
       preservePartitioning: Boolean = false
     ): DStream[U] = {
-    new MapPartitionedDStream(this, ssc.sc.clean(mapPartFunc), preservePartitioning)
+    new MapPartitionedDStream(this, context.sparkContext.clean(mapPartFunc), preservePartitioning)
   }
 
   /**
@@ -492,7 +455,7 @@ abstract class DStream[T: ClassManifest] (
    * this DStream will be registered as an output stream and therefore materialized.
    */
   def foreach(foreachFunc: (RDD[T], Time) => Unit) {
-    val newStream = new ForEachDStream(this, ssc.sc.clean(foreachFunc))
+    val newStream = new ForEachDStream(this, context.sparkContext.clean(foreachFunc))
     ssc.registerOutputStream(newStream)
     newStream
   }
@@ -510,7 +473,7 @@ abstract class DStream[T: ClassManifest] (
    * on each RDD of this DStream.
    */
   def transform[U: ClassManifest](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U] = {
-    new TransformedDStream(this, ssc.sc.clean(transformFunc))
+    new TransformedDStream(this, context.sparkContext.clean(transformFunc))
   }
 
   /**
@@ -527,7 +490,7 @@ abstract class DStream[T: ClassManifest] (
       if (first11.size > 10) println("...")
       println()
     }
-    val newStream = new ForEachDStream(this, ssc.sc.clean(foreachFunc))
+    val newStream = new ForEachDStream(this, context.sparkContext.clean(foreachFunc))
     ssc.registerOutputStream(newStream)
   }
 
@@ -651,7 +614,3 @@ abstract class DStream[T: ClassManifest] (
     ssc.registerOutputStream(this)
   }
 }
-
-private[streaming]
-case class DStreamCheckpointData(rdds: HashMap[Time, Any])
-
diff --git a/streaming/src/main/scala/spark/streaming/DStreamCheckpointData.scala b/streaming/src/main/scala/spark/streaming/DStreamCheckpointData.scala
new file mode 100644
index 0000000000..a375980b84
--- /dev/null
+++ b/streaming/src/main/scala/spark/streaming/DStreamCheckpointData.scala
@@ -0,0 +1,93 @@
+package spark.streaming
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.FileSystem
+import org.apache.hadoop.conf.Configuration
+import collection.mutable.HashMap
+import spark.Logging
+
+
+
+private[streaming]
+class DStreamCheckpointData[T: ClassManifest] (dstream: DStream[T])
+  extends Serializable with Logging {
+  protected val data = new HashMap[Time, AnyRef]()
+
+  @transient private var fileSystem : FileSystem = null
+  @transient private var lastCheckpointFiles: HashMap[Time, String] = null
+
+  protected[streaming] def checkpointFiles = data.asInstanceOf[HashMap[Time, String]]
+
+  /**
+   * Updates the checkpoint data of the DStream. This gets called every time
+   * the graph checkpoint is initiated. Default implementation records the
+   * checkpoint files to which the generate RDDs of the DStream has been saved.
+   */
+  def update() {
+
+    // Get the checkpointed RDDs from the generated RDDs
+    val newCheckpointFiles = dstream.generatedRDDs.filter(_._2.getCheckpointFile.isDefined)
+                                       .map(x => (x._1, x._2.getCheckpointFile.get))
+
+    // Make a copy of the existing checkpoint data (checkpointed RDDs)
+    lastCheckpointFiles = checkpointFiles.clone()
+
+    // If the new checkpoint data has checkpoints then replace existing with the new one
+    if (newCheckpointFiles.size > 0) {
+      checkpointFiles.clear()
+      checkpointFiles ++= newCheckpointFiles
+    }
+
+    // TODO: remove this, this is just for debugging
+    newCheckpointFiles.foreach {
+      case (time, data) => { logInfo("Added checkpointed RDD for time " + time + " to stream checkpoint") }
+    }
+  }
+
+  /**
+   * Cleanup old checkpoint data. This gets called every time the graph
+   * checkpoint is initiated, but after `update` is called. Default
+   * implementation, cleans up old checkpoint files.
+   */
+  def cleanup() {
+    // If there is at least on checkpoint file in the current checkpoint files,
+    // then delete the old checkpoint files.
+    if (checkpointFiles.size > 0 && lastCheckpointFiles != null) {
+      (lastCheckpointFiles -- checkpointFiles.keySet).foreach {
+        case (time, file) => {
+          try {
+            val path = new Path(file)
+            if (fileSystem == null) {
+              fileSystem = path.getFileSystem(new Configuration())
+            }
+            fileSystem.delete(path, true)
+            logInfo("Deleted checkpoint file '" + file + "' for time " + time)
+          } catch {
+            case e: Exception =>
+              logWarning("Error deleting old checkpoint file '" + file + "' for time " + time, e)
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Restore the checkpoint data. This gets called once when the DStream graph
+   * (along with its DStreams) are being restored from a graph checkpoint file.
+   * Default implementation restores the RDDs from their checkpoint files.
+   */
+  def restore() {
+    // Create RDDs from the checkpoint data
+    checkpointFiles.foreach {
+      case(time, file) => {
+        logInfo("Restoring checkpointed RDD for time " + time + " from file '" + file + "'")
+        dstream.generatedRDDs += ((time, dstream.context.sparkContext.checkpointFile[T](file)))
+      }
+    }
+  }
+
+  override def toString() = {
+    "[\n" + checkpointFiles.size + "\n" + checkpointFiles.mkString("\n") + "\n]"
+  }
+}
+
diff --git a/streaming/src/main/scala/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/spark/streaming/DStreamGraph.scala
index bc4a40d7bc..d5a5496839 100644
--- a/streaming/src/main/scala/spark/streaming/DStreamGraph.scala
+++ b/streaming/src/main/scala/spark/streaming/DStreamGraph.scala
@@ -87,7 +87,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging {
 
   private[streaming] def forgetOldRDDs(time: Time) {
     this.synchronized {
-      outputStreams.foreach(_.forgetOldRDDs(time))
+      outputStreams.foreach(_.forgetOldMetadata(time))
     }
   }
 
diff --git a/streaming/src/main/scala/spark/streaming/JobManager.scala b/streaming/src/main/scala/spark/streaming/JobManager.scala
index 3b910538e0..5acdd01e58 100644
--- a/streaming/src/main/scala/spark/streaming/JobManager.scala
+++ b/streaming/src/main/scala/spark/streaming/JobManager.scala
@@ -3,6 +3,8 @@ package spark.streaming
 import spark.Logging 
 import spark.SparkEnv
 import java.util.concurrent.Executors
+import collection.mutable.HashMap
+import collection.mutable.ArrayBuffer
 
 
 private[streaming]
@@ -19,15 +21,41 @@ class JobManager(ssc: StreamingContext, numThreads: Int = 1) extends Logging {
         case e: Exception =>
           logError("Running " + job + " failed", e)
       }
+      clearJob(job)
     }
   }
 
   initLogging()
 
   val jobExecutor = Executors.newFixedThreadPool(numThreads) 
-  
+  val jobs = new HashMap[Time, ArrayBuffer[Job]]
+
   def runJob(job: Job) {
+    jobs.synchronized {
+      jobs.getOrElseUpdate(job.time, new ArrayBuffer[Job]) += job
+    }
     jobExecutor.execute(new JobHandler(ssc, job))
     logInfo("Added " + job + " to queue")
   }
+
+  private def clearJob(job: Job) {
+    jobs.synchronized {
+      val jobsOfTime = jobs.get(job.time)
+      if (jobsOfTime.isDefined) {
+        jobsOfTime.get -= job
+        if (jobsOfTime.get.isEmpty) {
+          jobs -= job.time
+        }
+      } else {
+        throw new Exception("Job finished for time " + job.time +
+          " but time does not exist in jobs")
+      }
+    }
+  }
+
+  def getPendingTimes(): Array[Time] = {
+    jobs.synchronized {
+      jobs.keySet.toArray
+    }
+  }
 }
diff --git a/streaming/src/main/scala/spark/streaming/Scheduler.scala b/streaming/src/main/scala/spark/streaming/Scheduler.scala
index c04ed37de8..b77986a3ba 100644
--- a/streaming/src/main/scala/spark/streaming/Scheduler.scala
+++ b/streaming/src/main/scala/spark/streaming/Scheduler.scala
@@ -35,10 +35,13 @@ class Scheduler(ssc: StreamingContext) extends Logging {
       // either set the manual clock to the last checkpointed time,
       // or if the property is defined set it to that time
       if (clock.isInstanceOf[ManualClock]) {
-        val lastTime = ssc.getInitialCheckpoint.checkpointTime.milliseconds
+        val lastTime = ssc.initialCheckpoint.checkpointTime.milliseconds
         val jumpTime = System.getProperty("spark.streaming.manualClock.jump", "0").toLong
         clock.asInstanceOf[ManualClock].setTime(lastTime + jumpTime)
       }
+      // Reschedule the batches that were received but not processed before failure
+      ssc.initialCheckpoint.pendingTimes.foreach(time => generateRDDs(time))
+      // Restart the timer
       timer.restart(graph.zeroTime.milliseconds)
       logInfo("Scheduler's timer restarted")
     } else {
diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala
index 86bd4844e9..8cfbec51d2 100644
--- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala
@@ -61,7 +61,7 @@ class StreamingContext private (
 
   protected[streaming] val isCheckpointPresent = (cp_ != null)
 
-  val sc: SparkContext = {
+  protected[streaming] val sc: SparkContext = {
     if (isCheckpointPresent) {
       new SparkContext(cp_.master, cp_.framework, cp_.sparkHome, cp_.jars)
     } else {
@@ -101,6 +101,11 @@ class StreamingContext private (
   protected[streaming] var scheduler: Scheduler = null
 
   /**
+   * Returns the associated Spark context
+   */
+  def sparkContext = sc
+
+  /**
    * Sets each DStreams in this context to remember RDDs it generated in the last given duration.
    * DStreams remember RDDs only for a limited duration of time and releases them for garbage
    * collection. This method allows the developer to specify how to long to remember the RDDs (
@@ -128,7 +133,7 @@ class StreamingContext private (
     }
   }
 
-  protected[streaming] def getInitialCheckpoint(): Checkpoint = {
+  protected[streaming] def initialCheckpoint: Checkpoint = {
     if (isCheckpointPresent) cp_ else null
   }
 
@@ -374,7 +379,7 @@ class StreamingContext private (
   }
 
   /**
-   * Sstops the execution of the streams.
+   * Stops the execution of the streams.
    */
   def stop() {
     try {
@@ -401,7 +406,7 @@ object StreamingContext {
     // Set the default cleaner delay to an hour if not already set.
     // This should be sufficient for even 1 second interval.
     if (MetadataCleaner.getDelaySeconds < 0) {
-      MetadataCleaner.setDelaySeconds(60)
+      MetadataCleaner.setDelaySeconds(3600)
     }
     new SparkContext(master, frameworkName)
   }
diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala
index f82e6a37cc..5bbf2b084f 100644
--- a/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala
+++ b/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala
@@ -34,6 +34,14 @@ class JavaStreamingContext(val ssc: StreamingContext) {
     this(new StreamingContext(master, frameworkName, batchDuration))
 
   /**
+   * Creates a StreamingContext.
+   * @param sparkContext The underlying JavaSparkContext to use
+   * @param batchDuration The time interval at which streaming data will be divided into batches
+   */
+  def this(sparkContext: JavaSparkContext, batchDuration: Duration) =
+    this(new StreamingContext(sparkContext.sc, batchDuration))
+
+  /**
    * Re-creates a StreamingContext from a checkpoint file.
    * @param path Path either to the directory that was specified as the checkpoint directory, or
    *             to the checkpoint file 'graph' or 'graph.bk'.
@@ -45,27 +53,24 @@ class JavaStreamingContext(val ssc: StreamingContext) {
 
   /**
    * Create an input stream that pulls messages form a Kafka Broker.
-   * @param hostname Zookeper hostname.
-   * @param port Zookeper port.
+   * @param zkQuorum Zookeper quorum (hostname:port,hostname:port,..).
    * @param groupId The group id for this consumer.
    * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
    * in its own thread.
    */
   def kafkaStream[T](
-    hostname: String,
-    port: Int,
+    zkQuorum: String,
     groupId: String,
     topics: JMap[String, JInt])
   : JavaDStream[T] = {
     implicit val cmt: ClassManifest[T] =
       implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[T]]
-    ssc.kafkaStream[T](hostname, port, groupId, Map(topics.mapValues(_.intValue()).toSeq: _*))
+    ssc.kafkaStream[T](zkQuorum, groupId, Map(topics.mapValues(_.intValue()).toSeq: _*))
   }
 
   /**
    * Create an input stream that pulls messages form a Kafka Broker.
-   * @param hostname Zookeper hostname.
-   * @param port Zookeper port.
+   * @param zkQuorum Zookeper quorum (hostname:port,hostname:port,..).
    * @param groupId The group id for this consumer.
    * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
    * in its own thread.
@@ -73,8 +78,7 @@ class JavaStreamingContext(val ssc: StreamingContext) {
    * By default the value is pulled from zookeper.
    */
   def kafkaStream[T](
-    hostname: String,
-    port: Int,
+    zkQuorum: String,
     groupId: String,
     topics: JMap[String, JInt],
     initialOffsets: JMap[KafkaPartitionKey, JLong])
@@ -82,8 +86,7 @@ class JavaStreamingContext(val ssc: StreamingContext) {
     implicit val cmt: ClassManifest[T] =
       implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[T]]
     ssc.kafkaStream[T](
-      hostname,
-      port,
+      zkQuorum,
       groupId,
       Map(topics.mapValues(_.intValue()).toSeq: _*),
       Map(initialOffsets.mapValues(_.longValue()).toSeq: _*))
@@ -91,8 +94,7 @@ class JavaStreamingContext(val ssc: StreamingContext) {
 
   /**
    * Create an input stream that pulls messages form a Kafka Broker.
-   * @param hostname Zookeper hostname.
-   * @param port Zookeper port.
+   * @param zkQuorum Zookeper quorum (hostname:port,hostname:port,..).
    * @param groupId The group id for this consumer.
    * @param topics Map of (topic_name -> numPartitions) to consume. Each partition is consumed
    * in its own thread.
@@ -101,8 +103,7 @@ class JavaStreamingContext(val ssc: StreamingContext) {
    * @param storageLevel RDD storage level. Defaults to memory-only
    */
   def kafkaStream[T](
-    hostname: String,
-    port: Int,
+    zkQuorum: String,
     groupId: String,
     topics: JMap[String, JInt],
     initialOffsets: JMap[KafkaPartitionKey, JLong],
@@ -111,8 +112,7 @@ class JavaStreamingContext(val ssc: StreamingContext) {
     implicit val cmt: ClassManifest[T] =
       implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[T]]
     ssc.kafkaStream[T](
-      hostname,
-      port,
+      zkQuorum,
       groupId,
       Map(topics.mapValues(_.intValue()).toSeq: _*),
       Map(initialOffsets.mapValues(_.longValue()).toSeq: _*),
diff --git a/streaming/src/main/scala/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/spark/streaming/dstream/FileInputDStream.scala
index 1e6ad84b44..c6ffb252ce 100644
--- a/streaming/src/main/scala/spark/streaming/dstream/FileInputDStream.scala
+++ b/streaming/src/main/scala/spark/streaming/dstream/FileInputDStream.scala
@@ -2,13 +2,14 @@ package spark.streaming.dstream
 
 import spark.RDD
 import spark.rdd.UnionRDD
-import spark.streaming.{StreamingContext, Time}
+import spark.streaming.{DStreamCheckpointData, StreamingContext, Time}
 
 import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
 
-import scala.collection.mutable.HashSet
+import scala.collection.mutable.{HashSet, HashMap}
+import java.io.{ObjectInputStream, IOException}
 
 private[streaming]
 class FileInputDStream[K: ClassManifest, V: ClassManifest, F <: NewInputFormat[K,V] : ClassManifest](
@@ -18,21 +19,14 @@ class FileInputDStream[K: ClassManifest, V: ClassManifest, F <: NewInputFormat[K
     newFilesOnly: Boolean = true) 
   extends InputDStream[(K, V)](ssc_) {
 
-  @transient private var path_ : Path = null
-  @transient private var fs_ : FileSystem = null
-
-  var lastModTime = 0L
-  val lastModTimeFiles = new HashSet[String]()
+  protected[streaming] override val checkpointData = new FileInputDStreamCheckpointData
 
-  def path(): Path = {
-    if (path_ == null) path_ = new Path(directory)
-    path_
-  }
+  private val lastModTimeFiles = new HashSet[String]()
+  private var lastModTime = 0L
 
-  def fs(): FileSystem = {
-    if (fs_ == null) fs_ = path.getFileSystem(new Configuration())
-    fs_
-  }
+  @transient private var path_ : Path = null
+  @transient private var fs_ : FileSystem = null
+  @transient private var files = new HashMap[Time, Array[String]]
 
   override def start() {
     if (newFilesOnly) {
@@ -79,8 +73,8 @@ class FileInputDStream[K: ClassManifest, V: ClassManifest, F <: NewInputFormat[K
       }
     }
 
-    val newFiles = fs.listStatus(path, newFilter)
-    logInfo("New files: " + newFiles.map(_.getPath).mkString(", "))
+    val newFiles = fs.listStatus(path, newFilter).map(_.getPath.toString)
+    logInfo("New files: " + newFiles.mkString(", "))
     if (newFiles.length > 0) {
       // Update the modification time and the files processed for that modification time
       if (lastModTime != newFilter.latestModTime) {
@@ -89,9 +83,70 @@ class FileInputDStream[K: ClassManifest, V: ClassManifest, F <: NewInputFormat[K
       }
       lastModTimeFiles ++= newFilter.latestModTimeFiles
     }
-    val newRDD = new UnionRDD(ssc.sc, newFiles.map(
-      file => ssc.sc.newAPIHadoopFile[K, V, F](file.getPath.toString)))
-    Some(newRDD)
+    files += ((validTime, newFiles))
+    Some(filesToRDD(newFiles))
+  }
+
+  /** Forget the old time-to-files mappings along with old RDDs */
+  protected[streaming] override def forgetOldMetadata(time: Time) {
+    super.forgetOldMetadata(time)
+    val filesToBeRemoved = files.filter(_._1 <= (time - rememberDuration))
+    files --= filesToBeRemoved.keys
+    logInfo("Forgot " + filesToBeRemoved.size + " files from " + this)
+  }
+
+  /** Generate one RDD from an array of files */
+  protected[streaming] def filesToRDD(files: Seq[String]): RDD[(K, V)] = {
+    new UnionRDD(
+      context.sparkContext,
+      files.map(file => context.sparkContext.newAPIHadoopFile[K, V, F](file))
+    )
+  }
+
+  private def path: Path = {
+    if (path_ == null) path_ = new Path(directory)
+    path_
+  }
+
+  private def fs: FileSystem = {
+    if (fs_ == null) fs_ = path.getFileSystem(new Configuration())
+    fs_
+  }
+
+  @throws(classOf[IOException])
+  private def readObject(ois: ObjectInputStream) {
+    logDebug(this.getClass().getSimpleName + ".readObject used")
+    ois.defaultReadObject()
+    generatedRDDs = new HashMap[Time, RDD[(K,V)]] ()
+    files = new HashMap[Time, Array[String]]
+  }
+
+  /**
+   * A custom version of the DStreamCheckpointData that stores names of
+   * Hadoop files as checkpoint data.
+   */
+  private[streaming]
+  class FileInputDStreamCheckpointData extends DStreamCheckpointData(this) {
+
+     def hadoopFiles = data.asInstanceOf[HashMap[Time, Array[String]]]
+
+    override def update() {
+      hadoopFiles.clear()
+      hadoopFiles ++= files
+    }
+
+    override def cleanup() { }
+
+    override def restore() {
+      hadoopFiles.foreach {
+        case (time, files) => {
+          logInfo("Restoring Hadoop RDD for time " + time + " from files " +
+            files.mkString("[", ",", "]") )
+          files
+          generatedRDDs += ((time, filesToRDD(files)))
+        }
+      }
+    }
   }
 }
 
@@ -100,3 +155,4 @@ object FileInputDStream {
   def defaultFilter(path: Path): Boolean = !path.getName().startsWith(".")
 }
 
+
diff --git a/streaming/src/main/scala/spark/streaming/dstream/NetworkInputDStream.scala b/streaming/src/main/scala/spark/streaming/dstream/NetworkInputDStream.scala
index aa6be95f30..8c322dd698 100644
--- a/streaming/src/main/scala/spark/streaming/dstream/NetworkInputDStream.scala
+++ b/streaming/src/main/scala/spark/streaming/dstream/NetworkInputDStream.scala
@@ -153,8 +153,8 @@ abstract class NetworkReceiver[T: ClassManifest]() extends Serializable with Log
   /** A helper actor that communicates with the NetworkInputTracker */
   private class NetworkReceiverActor extends Actor {
     logInfo("Attempting to register with tracker")
-    val ip = System.getProperty("spark.master.host", "localhost")
-    val port = System.getProperty("spark.master.port", "7077").toInt
+    val ip = System.getProperty("spark.driver.host", "localhost")
+    val port = System.getProperty("spark.driver.port", "7077").toInt
     val url = "akka://spark@%s:%s/user/NetworkInputTracker".format(ip, port)
     val tracker = env.actorSystem.actorFor(url)
     val timeout = 5.seconds
diff --git a/streaming/src/main/scala/spark/streaming/dstream/RawInputDStream.scala b/streaming/src/main/scala/spark/streaming/dstream/RawInputDStream.scala
index 290fab1ce0..04e6b69b7b 100644
--- a/streaming/src/main/scala/spark/streaming/dstream/RawInputDStream.scala
+++ b/streaming/src/main/scala/spark/streaming/dstream/RawInputDStream.scala
@@ -1,6 +1,6 @@
 package spark.streaming.dstream
 
-import spark.{DaemonThread, Logging}
+import spark.Logging
 import spark.storage.StorageLevel
 import spark.streaming.StreamingContext
 
@@ -48,7 +48,8 @@ class RawNetworkReceiver(host: String, port: Int, storageLevel: StorageLevel)
 
     val queue = new ArrayBlockingQueue[ByteBuffer](2)
 
-    blockPushingThread = new DaemonThread {
+    blockPushingThread = new Thread {
+      setDaemon(true)
       override def run() {
         var nextBlockNumber = 0
         while (true) {
diff --git a/streaming/src/test/java/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java
index c84e7331c7..fbe4af4597 100644
--- a/streaming/src/test/java/JavaAPISuite.java
+++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java
@@ -43,9 +43,9 @@ public class JavaAPISuite implements Serializable {
     ssc = null;
 
     // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
-    System.clearProperty("spark.master.port");
+    System.clearProperty("spark.driver.port");
   }
-
+  /*
   @Test
   public void testCount() {
     List<List<Integer>> inputData = Arrays.asList(
@@ -434,7 +434,7 @@ public class JavaAPISuite implements Serializable {
 
     assertOrderInvariantEquals(expected, result);
   }
-
+  */
   /*
    * Performs an order-invariant comparison of lists representing two RDD streams. This allows
    * us to account for ordering variation within individual RDD's which occurs during windowing.
@@ -450,7 +450,7 @@ public class JavaAPISuite implements Serializable {
     Assert.assertEquals(expected, actual);
   }
 
-
+  /*
   // PairDStream Functions
   @Test
   public void testPairFilter() {
@@ -897,7 +897,7 @@ public class JavaAPISuite implements Serializable {
 
     Assert.assertEquals(expected, result);
   }
-
+  */
   @Test
   public void testCheckpointMasterRecovery() throws InterruptedException {
     List<List<String>> inputData = Arrays.asList(
@@ -911,7 +911,6 @@ public class JavaAPISuite implements Serializable {
         Arrays.asList(1,4),
         Arrays.asList(8,7));
 
-
     File tempDir = Files.createTempDir();
     ssc.checkpoint(tempDir.getAbsolutePath(), new Duration(1000));
 
@@ -927,14 +926,16 @@ public class JavaAPISuite implements Serializable {
 
     assertOrderInvariantEquals(expectedInitial, initialResult);
     Thread.sleep(1000);
-
     ssc.stop();
+
     ssc = new JavaStreamingContext(tempDir.getAbsolutePath());
-    ssc.start();
-    List<List<Integer>> finalResult = JavaCheckpointTestUtils.runStreams(ssc, 2, 2);
-    assertOrderInvariantEquals(expectedFinal, finalResult);
+    // Tweak to take into consideration that the last batch before failure
+    // will be re-processed after recovery
+    List<List<Integer>> finalResult = JavaCheckpointTestUtils.runStreams(ssc, 2, 3);
+    assertOrderInvariantEquals(expectedFinal, finalResult.subList(1, 3));
   }
 
+
   /** TEST DISABLED: Pending a discussion about checkpoint() semantics with TD
   @Test
   public void testCheckpointofIndividualStream() throws InterruptedException {
@@ -963,7 +964,7 @@ public class JavaAPISuite implements Serializable {
     assertOrderInvariantEquals(expected, result1);
   }
   */
-
+  /*
   // Input stream tests. These mostly just test that we can instantiate a given InputStream with
   // Java arguments and assign it to a JavaDStream without producing type errors. Testing of the
   // InputStream functionality is deferred to the existing Scala tests.
@@ -1025,5 +1026,5 @@ public class JavaAPISuite implements Serializable {
   public void testFileStream() {
     JavaPairDStream<String, String> foo =
       ssc.<String, String, SequenceFileInputFormat>fileStream("/tmp/foo");
-  }
+  }*/
 }
diff --git a/streaming/src/test/java/JavaTestUtils.scala b/streaming/src/test/java/spark/streaming/JavaTestUtils.scala
index 56349837e5..56349837e5 100644
--- a/streaming/src/test/java/JavaTestUtils.scala
+++ b/streaming/src/test/java/spark/streaming/JavaTestUtils.scala
diff --git a/streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala
index d98b840b8e..c031949dd1 100644
--- a/streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala
+++ b/streaming/src/test/scala/spark/streaming/BasicOperationsSuite.scala
@@ -10,7 +10,7 @@ class BasicOperationsSuite extends TestSuiteBase {
 
   after {
     // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
-    System.clearProperty("spark.master.port")
+    System.clearProperty("spark.driver.port")
   }
 
   test("map") {
diff --git a/streaming/src/test/scala/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/spark/streaming/CheckpointSuite.scala
index d2f32c189b..7126af62d9 100644
--- a/streaming/src/test/scala/spark/streaming/CheckpointSuite.scala
+++ b/streaming/src/test/scala/spark/streaming/CheckpointSuite.scala
@@ -7,6 +7,8 @@ import org.scalatest.BeforeAndAfter
 import org.apache.commons.io.FileUtils
 import collection.mutable.{SynchronizedBuffer, ArrayBuffer}
 import util.{Clock, ManualClock}
+import scala.util.Random
+import com.google.common.io.Files
 
 class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
 
@@ -19,7 +21,7 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     FileUtils.deleteDirectory(new File(checkpointDir))
 
     // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
-    System.clearProperty("spark.master.port")
+    System.clearProperty("spark.driver.port")
   }
 
   var ssc: StreamingContext = null
@@ -32,7 +34,7 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
 
   override def actuallyWait = true
 
-  test("basic stream+rdd recovery") {
+  test("basic rdd checkpoints + dstream graph checkpoint recovery") {
 
     assert(batchDuration === Milliseconds(500), "batchDuration for this test must be 1 second")
     assert(checkpointInterval === batchDuration, "checkpointInterval for this test much be same as batchDuration")
@@ -63,9 +65,9 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     // then check whether some RDD has been checkpointed or not
     ssc.start()
     runStreamsWithRealDelay(ssc, firstNumBatches)
-    logInfo("Checkpoint data of state stream = \n[" + stateStream.checkpointData.rdds.mkString(",\n") + "]")
-    assert(!stateStream.checkpointData.rdds.isEmpty, "No checkpointed RDDs in state stream before first failure")
-    stateStream.checkpointData.rdds.foreach {
+    logInfo("Checkpoint data of state stream = \n" + stateStream.checkpointData)
+    assert(!stateStream.checkpointData.checkpointFiles.isEmpty, "No checkpointed RDDs in state stream before first failure")
+    stateStream.checkpointData.checkpointFiles.foreach {
       case (time, data) => {
         val file = new File(data.toString)
         assert(file.exists(), "Checkpoint file '" + file +"' for time " + time + " for state stream before first failure does not exist")
@@ -74,7 +76,7 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
 
     // Run till a further time such that previous checkpoint files in the stream would be deleted
     // and check whether the earlier checkpoint files are deleted
-    val checkpointFiles = stateStream.checkpointData.rdds.map(x => new File(x._2.toString))
+    val checkpointFiles = stateStream.checkpointData.checkpointFiles.map(x => new File(x._2))
     runStreamsWithRealDelay(ssc, secondNumBatches)
     checkpointFiles.foreach(file => assert(!file.exists, "Checkpoint file '" + file + "' was not deleted"))
     ssc.stop()
@@ -91,8 +93,8 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     // is present in the checkpoint data or not
     ssc.start()
     runStreamsWithRealDelay(ssc, 1)
-    assert(!stateStream.checkpointData.rdds.isEmpty, "No checkpointed RDDs in state stream before second failure")
-    stateStream.checkpointData.rdds.foreach {
+    assert(!stateStream.checkpointData.checkpointFiles.isEmpty, "No checkpointed RDDs in state stream before second failure")
+    stateStream.checkpointData.checkpointFiles.foreach {
       case (time, data) => {
         val file = new File(data.toString)
         assert(file.exists(),
@@ -117,7 +119,10 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     ssc = null
   }
 
-  test("map and reduceByKey") {
+  // This tests whether the systm can recover from a master failure with simple
+  // non-stateful operations. This assumes as reliable, replayable input
+  // source - TestInputDStream.
+  test("recovery with map and reduceByKey operations") {
     testCheckpointedOperation(
       Seq( Seq("a", "a", "b"), Seq("", ""), Seq(), Seq("a", "a", "b"), Seq("", ""), Seq() ),
       (s: DStream[String]) => s.map(x => (x, 1)).reduceByKey(_ + _),
@@ -126,7 +131,11 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     )
   }
 
-  test("reduceByKeyAndWindowInv") {
+
+  // This tests whether the ReduceWindowedDStream's RDD checkpoints works correctly such
+  // that the system can recover from a master failure. This assumes as reliable,
+  // replayable input source - TestInputDStream.
+  test("recovery with invertible reduceByKeyAndWindow operation") {
     val n = 10
     val w = 4
     val input = (1 to n).map(_ => Seq("a")).toSeq
@@ -139,7 +148,11 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     testCheckpointedOperation(input, operation, output, 7)
   }
 
-  test("updateStateByKey") {
+
+  // This tests whether the StateDStream's RDD checkpoints works correctly such
+  // that the system can recover from a master failure. This assumes as reliable,
+  // replayable input source - TestInputDStream.
+  test("recovery with updateStateByKey operation") {
     val input = (1 to 10).map(_ => Seq("a")).toSeq
     val output = (1 to 10).map(x => Seq(("a", x))).toSeq
     val operation = (st: DStream[String]) => {
@@ -154,11 +167,99 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     testCheckpointedOperation(input, operation, output, 7)
   }
 
+  // This tests whether file input stream remembers what files were seen before
+  // the master failure and uses them again to process a large window operatoin.
+  // It also tests whether batches, whose processing was incomplete due to the
+  // failure, are re-processed or not.
+  test("recovery with file input stream") {
+    // Set up the streaming context and input streams
+    val testDir = Files.createTempDir()
+    var ssc = new StreamingContext(master, framework, batchDuration)
+    ssc.checkpoint(checkpointDir, checkpointInterval)
+    val fileStream = ssc.textFileStream(testDir.toString)
+    // Making value 3 take large time to process, to ensure that the master
+    // shuts down in the middle of processing the 3rd batch
+    val mappedStream = fileStream.map(s => {
+      val i = s.toInt
+      if (i == 3) Thread.sleep(1000)
+      i
+    })
+    // Reducing over a large window to ensure that recovery from master failure
+    // requires reprocessing of all the files seen before the failure
+    val reducedStream = mappedStream.reduceByWindow(_ + _, batchDuration * 30, batchDuration)
+    val outputBuffer = new ArrayBuffer[Seq[Int]]
+    var outputStream = new TestOutputStream(reducedStream, outputBuffer)
+    ssc.registerOutputStream(outputStream)
+    ssc.start()
+
+    // Create files and advance manual clock to process them
+    var clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
+    Thread.sleep(1000)
+    for (i <- Seq(1, 2, 3)) {
+      FileUtils.writeStringToFile(new File(testDir, i.toString), i.toString + "\n")
+      // wait to make sure that the file is written such that it gets shown in the file listings
+      Thread.sleep(500)
+      clock.addToTime(batchDuration.milliseconds)
+      // wait to make sure that FileInputDStream picks up this file only and not any other file
+      Thread.sleep(500)
+    }
+    logInfo("Output = " + outputStream.output.mkString(","))
+    assert(outputStream.output.size > 0, "No files processed before restart")
+    ssc.stop()
+
+    // Create files while the master is down
+    for (i <- Seq(4, 5, 6)) {
+      FileUtils.writeStringToFile(new File(testDir, i.toString), i.toString + "\n")
+      Thread.sleep(1000)
+    }
+
+    // Restart stream computation from checkpoint and create more files to see whether
+    // they are being processed
+    logInfo("*********** RESTARTING ************")
+    ssc = new StreamingContext(checkpointDir)
+    ssc.start()
+    clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
+    for (i <- Seq(7, 8, 9)) {
+      FileUtils.writeStringToFile(new File(testDir, i.toString), i.toString + "\n")
+      Thread.sleep(500)
+      clock.addToTime(batchDuration.milliseconds)
+      Thread.sleep(500)
+    }
+    Thread.sleep(1000)
+    logInfo("Output = " + outputStream.output.mkString(","))
+    assert(outputStream.output.size > 0, "No files processed after restart")
+    ssc.stop()
+
+    // Append the new output to the old buffer
+    outputStream = ssc.graph.getOutputStreams().head.asInstanceOf[TestOutputStream[Int]]
+    outputBuffer ++= outputStream.output
+
+    // Verify whether data received by Spark Streaming was as expected
+    val expectedOutput = Seq(1, 3, 6, 28, 36, 45)
+    logInfo("--------------------------------")
+    logInfo("output, size = " + outputBuffer.size)
+    outputBuffer.foreach(x => logInfo("[" + x.mkString(",") + "]"))
+    logInfo("expected output, size = " + expectedOutput.size)
+    expectedOutput.foreach(x => logInfo("[" + x + "]"))
+    logInfo("--------------------------------")
+
+    // Verify whether all the elements received are as expected
+    assert(outputBuffer.size === expectedOutput.size)
+    for (i <- 0 until outputBuffer.size) {
+      assert(outputBuffer(i).size === 1)
+      assert(outputBuffer(i).head === expectedOutput(i))
+    }
+  }
+
+
   /**
-   * Tests a streaming operation under checkpointing, by restart the operation
+   * Tests a streaming operation under checkpointing, by restarting the operation
    * from checkpoint file and verifying whether the final output is correct.
    * The output is assumed to have come from a reliable queue which an replay
    * data as required.
+   *
+   * NOTE: This takes into consideration that the last batch processed before
+   * master failure will be re-processed after restart/recovery.
    */
   def testCheckpointedOperation[U: ClassManifest, V: ClassManifest](
     input: Seq[Seq[U]],
@@ -172,7 +273,8 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     val totalNumBatches = input.size
     val nextNumBatches = totalNumBatches - initialNumBatches
     val initialNumExpectedOutputs = initialNumBatches
-    val nextNumExpectedOutputs = expectedOutput.size - initialNumExpectedOutputs
+    val nextNumExpectedOutputs = expectedOutput.size - initialNumExpectedOutputs + 1
+    // because the last batch will be processed again
 
     // Do the computation for initial number of batches, create checkpoint file and quit
     ssc = setupStreams[U, V](input, operation)
@@ -188,6 +290,7 @@ class CheckpointSuite extends TestSuiteBase with BeforeAndAfter {
     )
     ssc = new StreamingContext(checkpointDir)
     val outputNew = runStreams[V](ssc, nextNumBatches, nextNumExpectedOutputs)
+    // the first element will be re-processed data of the last batch before restart
     verifyOutput[V](outputNew, expectedOutput.takeRight(nextNumExpectedOutputs), true)
     ssc = null
   }
diff --git a/streaming/src/test/scala/spark/streaming/FailureSuite.scala b/streaming/src/test/scala/spark/streaming/FailureSuite.scala
index 7493ac1207..c4cfffbfc1 100644
--- a/streaming/src/test/scala/spark/streaming/FailureSuite.scala
+++ b/streaming/src/test/scala/spark/streaming/FailureSuite.scala
@@ -24,7 +24,7 @@ class FailureSuite extends TestSuiteBase with BeforeAndAfter {
     FileUtils.deleteDirectory(new File(checkpointDir))
 
     // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
-    System.clearProperty("spark.master.port")
+    System.clearProperty("spark.driver.port")
   }
 
   override def framework = "CheckpointSuite"
diff --git a/streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala
index d7ba7a5d17..c442210004 100644
--- a/streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala
+++ b/streaming/src/test/scala/spark/streaming/InputStreamsSuite.scala
@@ -19,35 +19,24 @@ import org.apache.avro.ipc.specific.SpecificRequestor
 import java.nio.ByteBuffer
 import collection.JavaConversions._
 import java.nio.charset.Charset
+import com.google.common.io.Files
 
 class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
     
   System.setProperty("spark.streaming.clock", "spark.streaming.util.ManualClock")
 
-  val testPort = 9999
-  var testServer: TestServer = null
-  var testDir: File = null
-
   override def checkpointDir = "checkpoint"
 
   after {
-    FileUtils.deleteDirectory(new File(checkpointDir))
-    if (testServer != null) {
-      testServer.stop()
-      testServer = null
-    }
-    if (testDir != null && testDir.exists()) {
-      FileUtils.deleteDirectory(testDir)
-      testDir = null
-    }
-
     // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
-    System.clearProperty("spark.master.port")
+    System.clearProperty("spark.driver.port")
   }
 
+
   test("network input stream") {
     // Start the server
-    testServer = new TestServer(testPort)
+    val testPort = 9999
+    val testServer = new TestServer(testPort)
     testServer.start()
 
     // Set up the streaming context and input streams
@@ -93,46 +82,6 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
     }
   }
 
-  test("network input stream with checkpoint") {
-    // Start the server
-    testServer = new TestServer(testPort)
-    testServer.start()
-
-    // Set up the streaming context and input streams
-    var ssc = new StreamingContext(master, framework, batchDuration)
-    ssc.checkpoint(checkpointDir, checkpointInterval)
-    val networkStream = ssc.networkTextStream("localhost", testPort, StorageLevel.MEMORY_AND_DISK)
-    var outputStream = new TestOutputStream(networkStream, new ArrayBuffer[Seq[String]])
-    ssc.registerOutputStream(outputStream)
-    ssc.start()
-
-    // Feed data to the server to send to the network receiver
-    var clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
-    for (i <- Seq(1, 2, 3)) {
-      testServer.send(i.toString + "\n")
-      Thread.sleep(100)
-      clock.addToTime(batchDuration.milliseconds)
-    }
-    Thread.sleep(500)
-    assert(outputStream.output.size > 0)
-    ssc.stop()
-
-    // Restart stream computation from checkpoint and feed more data to see whether
-    // they are being received and processed
-    logInfo("*********** RESTARTING ************")
-    ssc = new StreamingContext(checkpointDir)
-    ssc.start()
-    clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
-    for (i <- Seq(4, 5, 6)) {
-      testServer.send(i.toString + "\n")
-      Thread.sleep(100)
-      clock.addToTime(batchDuration.milliseconds)
-    }
-    Thread.sleep(500)
-    outputStream = ssc.graph.getOutputStreams().head.asInstanceOf[TestOutputStream[String]]
-    assert(outputStream.output.size > 0)
-    ssc.stop()
-  }
 
   test("flume input stream") {
     // Set up the streaming context and input streams
@@ -182,18 +131,10 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
     }
   }
 
-  test("file input stream") {
-
-    // Create a temporary directory
-    testDir = {
-      var temp = File.createTempFile(".temp.", Random.nextInt().toString)
-      temp.delete()
-      temp.mkdirs()
-      logInfo("Created temp dir " + temp)
-      temp
-    }
 
+  test("file input stream") {
     // Set up the streaming context and input streams
+    val testDir = Files.createTempDir()
     val ssc = new StreamingContext(master, framework, batchDuration)
     val filestream = ssc.textFileStream(testDir.toString)
     val outputBuffer = new ArrayBuffer[Seq[String]] with SynchronizedBuffer[Seq[String]]
@@ -214,10 +155,6 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
       //Thread.sleep(100)
     }
     val startTime = System.currentTimeMillis()
-    /*while (output.size < expectedOutput.size && System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
-      logInfo("output.size = " + output.size + ", expectedOutput.size = " + expectedOutput.size)
-      Thread.sleep(100)
-    }*/
     Thread.sleep(1000)
     val timeTaken = System.currentTimeMillis() - startTime
     assert(timeTaken < maxWaitTimeMillis, "Operation timed out after " + timeTaken + " ms")
@@ -226,11 +163,9 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
 
     // Verify whether data received by Spark Streaming was as expected
     logInfo("--------------------------------")
-    logInfo("output.size = " + outputBuffer.size)
-    logInfo("output")
+    logInfo("output, size = " + outputBuffer.size)
     outputBuffer.foreach(x => logInfo("[" + x.mkString(",") + "]"))
-    logInfo("expected output.size = " + expectedOutput.size)
-    logInfo("expected output")
+    logInfo("expected output, size = " + expectedOutput.size)
     expectedOutput.foreach(x => logInfo("[" + x.mkString(",") + "]"))
     logInfo("--------------------------------")
 
@@ -241,56 +176,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter {
       assert(output(i).size === 1)
       assert(output(i).head.toString === expectedOutput(i))
     }
-  }
-
-  test("file input stream with checkpoint") {
-    // Create a temporary directory
-    testDir = {
-      var temp = File.createTempFile(".temp.", Random.nextInt().toString)
-      temp.delete()
-      temp.mkdirs()
-      logInfo("Created temp dir " + temp)
-      temp
-    }
-
-    // Set up the streaming context and input streams
-    var ssc = new StreamingContext(master, framework, batchDuration)
-    ssc.checkpoint(checkpointDir, checkpointInterval)
-    val filestream = ssc.textFileStream(testDir.toString)
-    var outputStream = new TestOutputStream(filestream, new ArrayBuffer[Seq[String]])
-    ssc.registerOutputStream(outputStream)
-    ssc.start()
-
-    // Create files and advance manual clock to process them
-    var clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
-    Thread.sleep(1000)
-    for (i <- Seq(1, 2, 3)) {
-      FileUtils.writeStringToFile(new File(testDir, i.toString), i.toString + "\n")
-      Thread.sleep(100)
-      clock.addToTime(batchDuration.milliseconds)
-    }
-    Thread.sleep(500)
-    logInfo("Output = " + outputStream.output.mkString(","))
-    assert(outputStream.output.size > 0)
-    ssc.stop()
-
-    // Restart stream computation from checkpoint and create more files to see whether
-    // they are being processed
-    logInfo("*********** RESTARTING ************")
-    ssc = new StreamingContext(checkpointDir)
-    ssc.start()
-    clock = ssc.scheduler.clock.asInstanceOf[ManualClock]
-    Thread.sleep(500)
-    for (i <- Seq(4, 5, 6)) {
-      FileUtils.writeStringToFile(new File(testDir, i.toString), i.toString + "\n")
-      Thread.sleep(100)
-      clock.addToTime(batchDuration.milliseconds)
-    }
-    Thread.sleep(500)
-    outputStream = ssc.graph.getOutputStreams().head.asInstanceOf[TestOutputStream[String]]
-    logInfo("Output = " + outputStream.output.mkString(","))
-    assert(outputStream.output.size > 0)
-    ssc.stop()
+    FileUtils.deleteDirectory(testDir)
   }
 }
 
diff --git a/streaming/src/test/scala/spark/streaming/WindowOperationsSuite.scala b/streaming/src/test/scala/spark/streaming/WindowOperationsSuite.scala
index 0c6e928835..cd9608df53 100644
--- a/streaming/src/test/scala/spark/streaming/WindowOperationsSuite.scala
+++ b/streaming/src/test/scala/spark/streaming/WindowOperationsSuite.scala
@@ -13,7 +13,7 @@ class WindowOperationsSuite extends TestSuiteBase {
 
   after {
     // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
-    System.clearProperty("spark.master.port")
+    System.clearProperty("spark.driver.port")
   }
 
   val largerSlideInput = Seq(
author	Tathagata Das <tathagata.das1565@gmail.com>	2013-02-07 15:05:20 -0800
committer	Tathagata Das <tathagata.das1565@gmail.com>	2013-02-07 15:05:20 -0800
commit	bcee3cb2db2f213efdc142d83419dfd9e35bc4fe (patch)
tree	0aa12b24f0074f3c6522a8ab3c16bf8ac9097061 /streaming/src
parent	12300758ccbe7d45c6149d56772133ae1bc5cb25 (diff)
parent	4cc223b4785c9da39c4a35d2adb7339dfa8e47e6 (diff)
download	spark-bcee3cb2db2f213efdc142d83419dfd9e35bc4fe.tar.gz spark-bcee3cb2db2f213efdc142d83419dfd9e35bc4fe.tar.bz2 spark-bcee3cb2db2f213efdc142d83419dfd9e35bc4fe.zip