[SPARK-2790] [PySpark] fix zip with serializers which have different batch sizes.

If two RDDs have different batch size in serializers, then it will try to re-serialize the one with smaller batch size, then call RDD.zip() in Spark. Author: Davies Liu <davies.liu@gmail.com> Closes #1894 from davies/zip and squashes the following commits: c4652ea [Davies Liu] add more test cases 6d05fc8 [Davies Liu] Merge branch 'master' into zip 813b1e4 [Davies Liu] add more tests for failed cases a4aafda [Davies Liu] fix zip with serializers which have different batch sizes.
author: Davies Liu <davies.liu@gmail.com> 2014-08-19 14:46:32 -0700
committer: Josh Rosen <joshrosen@apache.org> 2014-08-19 14:46:32 -0700
commit: d7e80c2597d4a9cae2e0cb35a86f7889323f4cbb (patch)
tree: bff49107d4452ea55eb7df8b1d44bae2ee0b9baa /python/pyspark/rdd.py
parent: 76eaeb4523ee01cabbea2d867daac48a277885a1 (diff)
download: spark-d7e80c2597d4a9cae2e0cb35a86f7889323f4cbb.tar.gz
spark-d7e80c2597d4a9cae2e0cb35a86f7889323f4cbb.tar.bz2
spark-d7e80c2597d4a9cae2e0cb35a86f7889323f4cbb.zip
1 files changed, 25 insertions, 0 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 86cd89b245..140cbe05a4 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1687,6 +1687,31 @@ class RDD(object):
         >>> x.zip(y).collect()
         [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]
         """
+        if self.getNumPartitions() != other.getNumPartitions():
+            raise ValueError("Can only zip with RDD which has the same number of partitions")
+
+        def get_batch_size(ser):
+            if isinstance(ser, BatchedSerializer):
+                return ser.batchSize
+            return 0
+
+        def batch_as(rdd, batchSize):
+            ser = rdd._jrdd_deserializer
+            if isinstance(ser, BatchedSerializer):
+                ser = ser.serializer
+            return rdd._reserialize(BatchedSerializer(ser, batchSize))
+
+        my_batch = get_batch_size(self._jrdd_deserializer)
+        other_batch = get_batch_size(other._jrdd_deserializer)
+        if my_batch != other_batch:
+            # use the greatest batchSize to batch the other one.
+            if my_batch > other_batch:
+                other = batch_as(other, my_batch)
+            else:
+                self = batch_as(self, other_batch)
+
+        # There will be an Exception in JVM if there are different number
+        # of items in each partitions.
         pairRDD = self._jrdd.zip(other._jrdd)
         deserializer = PairDeserializer(self._jrdd_deserializer,
                                         other._jrdd_deserializer)
author	Davies Liu <davies.liu@gmail.com>	2014-08-19 14:46:32 -0700
committer	Josh Rosen <joshrosen@apache.org>	2014-08-19 14:46:32 -0700
commit	d7e80c2597d4a9cae2e0cb35a86f7889323f4cbb (patch)
tree	bff49107d4452ea55eb7df8b1d44bae2ee0b9baa /python/pyspark/rdd.py
parent	76eaeb4523ee01cabbea2d867daac48a277885a1 (diff)
download	spark-d7e80c2597d4a9cae2e0cb35a86f7889323f4cbb.tar.gz spark-d7e80c2597d4a9cae2e0cb35a86f7889323f4cbb.tar.bz2 spark-d7e80c2597d4a9cae2e0cb35a86f7889323f4cbb.zip