Merge pull request #373 from Reinvigorate/sm-updateStateByKey

StateDStream changes to give updateStateByKey consistent behavior
author: Tathagata Das <tathagata.das1565@gmail.com> 2013-02-07 11:59:19 -0800
committer: Tathagata Das <tathagata.das1565@gmail.com> 2013-02-07 11:59:19 -0800
commit: 915d9931fea72296ed50765ad3f7a88d254d7d60 (patch)
tree: 23244219b4c19f3eb2b92dfa341b29d69533e4df /streaming/src/main
parent: 86057ec7c868262763d1e31b3f3c94bd43eeafb3 (diff)
parent: c0694291c81ad775918421941a80a00ca9593a38 (diff)
download: spark-915d9931fea72296ed50765ad3f7a88d254d7d60.tar.gz
spark-915d9931fea72296ed50765ad3f7a88d254d7d60.tar.bz2
spark-915d9931fea72296ed50765ad3f7a88d254d7d60.zip
2 files changed, 28 insertions, 6 deletions
diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala
index 14500bdcb1..3cec35cb37 100644
--- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala
+++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala
@@ -283,17 +283,31 @@ class StreamingContext private (
   }
 
   /**
-   * Creates a input stream from an queue of RDDs. In each batch,
+   * Creates an input stream from a queue of RDDs. In each batch,
    * it will process either one or all of the RDDs returned by the queue.
    * @param queue      Queue of RDDs
    * @param oneAtATime Whether only one RDD should be consumed from the queue in every interval
-   * @param defaultRDD Default RDD is returned by the DStream when the queue is empty
    * @tparam T         Type of objects in the RDD
    */
   def queueStream[T: ClassManifest](
       queue: Queue[RDD[T]],
-      oneAtATime: Boolean = true,
-      defaultRDD: RDD[T] = null
+      oneAtATime: Boolean = true
+    ): DStream[T] = {
+    queueStream(queue, oneAtATime, sc.makeRDD(Seq[T](), 1))
+  }
+
+  /**
+   * Creates an input stream from a queue of RDDs. In each batch,
+   * it will process either one or all of the RDDs returned by the queue.
+   * @param queue      Queue of RDDs
+   * @param oneAtATime Whether only one RDD should be consumed from the queue in every interval
+   * @param defaultRDD Default RDD is returned by the DStream when the queue is empty. Set as null if no RDD should be returned when empty
+   * @tparam T         Type of objects in the RDD
+   */
+  def queueStream[T: ClassManifest](
+      queue: Queue[RDD[T]],
+      oneAtATime: Boolean,
+      defaultRDD: RDD[T]
     ): DStream[T] = {
     val inputStream = new QueueInputDStream(this, queue, oneAtATime, defaultRDD)
     registerInputStream(inputStream)
diff --git a/streaming/src/main/scala/spark/streaming/dstream/StateDStream.scala b/streaming/src/main/scala/spark/streaming/dstream/StateDStream.scala
index b4506c74aa..db62955036 100644
--- a/streaming/src/main/scala/spark/streaming/dstream/StateDStream.scala
+++ b/streaming/src/main/scala/spark/streaming/dstream/StateDStream.scala
@@ -48,8 +48,16 @@ class StateDStream[K: ClassManifest, V: ClassManifest, S: ClassManifest](
             //logDebug("Generating state RDD for time " + validTime)
             return Some(stateRDD)
           }
-          case None => {    // If parent RDD does not exist, then return old state RDD
-            return Some(prevStateRDD)
+          case None => {    // If parent RDD does not exist
+
+            // Re-apply the update function to the old state RDD
+            val updateFuncLocal = updateFunc
+            val finalFunc = (iterator: Iterator[(K, S)]) => {
+              val i = iterator.map(t => (t._1, Seq[V](), Option(t._2)))
+              updateFuncLocal(i)
+            }
+            val stateRDD = prevStateRDD.mapPartitions(finalFunc, preservePartitioning)
+            return Some(stateRDD)
           }
         }
       }
author	Tathagata Das <tathagata.das1565@gmail.com>	2013-02-07 11:59:19 -0800
committer	Tathagata Das <tathagata.das1565@gmail.com>	2013-02-07 11:59:19 -0800
commit	915d9931fea72296ed50765ad3f7a88d254d7d60 (patch)
tree	23244219b4c19f3eb2b92dfa341b29d69533e4df /streaming/src/main
parent	86057ec7c868262763d1e31b3f3c94bd43eeafb3 (diff)
parent	c0694291c81ad775918421941a80a00ca9593a38 (diff)
download	spark-915d9931fea72296ed50765ad3f7a88d254d7d60.tar.gz spark-915d9931fea72296ed50765ad3f7a88d254d7d60.tar.bz2 spark-915d9931fea72296ed50765ad3f7a88d254d7d60.zip