From 6a6c1fc5c807ba4e8aba3e260537aa527ff5d46a Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Thu, 10 Dec 2015 14:21:15 -0800
Subject: [SPARK-11713] [PYSPARK] [STREAMING] Initial RDD updateStateByKey for
 PySpark

Adding the ability to define an initial state RDD for use with updateStateByKey
in PySpark. Added a unit test and changed the stateful_network_wordcount
example to use an initial RDD.

Author: Bryan Cutler

Closes #10082 from BryanCutler/initial-rdd-updateStateByKey-SPARK-11713.
---
 python/pyspark/streaming/dstream.py | 13 +++++++++++--
 python/pyspark/streaming/tests.py   | 20 ++++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)
(limited to 'python/pyspark')

diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py
index acec850f02..f61137cb88 100644
--- a/python/pyspark/streaming/dstream.py
+++ b/python/pyspark/streaming/dstream.py
@@ -568,7 +568,7 @@ class DStream(object):
                                                              self._ssc._jduration(slideDuration))
         return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
 
-    def updateStateByKey(self, updateFunc, numPartitions=None):
+    def updateStateByKey(self, updateFunc, numPartitions=None, initialRDD=None):
         """
         Return a new "state" DStream where the state for each key is updated by applying
         the given function on the previous state of the key and the new values of the key.
@@ -579,6 +579,9 @@ class DStream(object):
         if numPartitions is None:
             numPartitions = self._sc.defaultParallelism
 
+        if initialRDD and not isinstance(initialRDD, RDD):
+            initialRDD = self._sc.parallelize(initialRDD)
+
         def reduceFunc(t, a, b):
             if a is None:
                 g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None))
@@ -590,7 +593,13 @@ class DStream(object):
 
         jreduceFunc = TransformFunction(self._sc, reduceFunc,
                                         self._sc.serializer, self._jrdd_deserializer)
-        dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc)
+        if initialRDD:
+            initialRDD = initialRDD._reserialize(self._jrdd_deserializer)
+            dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc,
+                                                       initialRDD._jrdd)
+        else:
+            dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc)
+
         return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
 
 
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index a2bfd79e1a..4949cd68e3 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -403,6 +403,26 @@ class BasicOperationTests(PySparkStreamingTestCase):
         expected = [[('k', v)] for v in expected]
         self._test_func(input, func, expected)
 
+    def test_update_state_by_key_initial_rdd(self):
+
+        def updater(vs, s):
+            if not s:
+                s = []
+            s.extend(vs)
+            return s
+
+        initial = [('k', [0, 1])]
+        initial = self.sc.parallelize(initial, 1)
+
+        input = [[('k', i)] for i in range(2, 5)]
+
+        def func(dstream):
+            return dstream.updateStateByKey(updater, initialRDD=initial)
+
+        expected = [[0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
+        expected = [[('k', v)] for v in expected]
+        self._test_func(input, func, expected)
+
     def test_failed_func(self):
         # Test failure in
         # TransformFunction.apply(rdd: Option[RDD[_]], time: Time)
-- 
cgit v1.2.3
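
For reference, here is what the new initialRDD parameter looks like in use. This
sketch is modeled on the stateful_network_wordcount example that the commit
message mentions but that falls outside this path-limited diff; the socket
host/port, batch interval, and checkpoint directory are illustrative
assumptions, not part of the patch.

    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext

    sc = SparkContext(appName="PythonStreamingStatefulNetworkWordCount")
    ssc = StreamingContext(sc, 1)  # 1-second batches (assumed)
    ssc.checkpoint("checkpoint")   # updateStateByKey requires checkpointing

    # State present before the first batch arrives.
    initialStateRDD = sc.parallelize([(u'hello', 1), (u'world', 1)])

    def updateFunc(new_values, last_sum):
        # Fold this batch's values into the running count for the key.
        return sum(new_values) + (last_sum or 0)

    lines = ssc.socketTextStream("localhost", 9999)
    running_counts = lines.flatMap(lambda line: line.split(" ")) \
                          .map(lambda word: (word, 1)) \
                          .updateStateByKey(updateFunc, initialRDD=initialStateRDD)

    running_counts.pprint()
    ssc.start()
    ssc.awaitTermination()

Note that initialRDD need not already be an RDD: as the dstream.py hunk above
shows, a plain iterable of (key, value) pairs is accepted and run through
SparkContext.parallelize before being handed to PythonStateDStream.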