From c246b95dd2f565043db429c38c6cc029a0b870c1 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 15 Dec 2014 22:58:26 -0800 Subject: [SPARK-4841] fix zip with textFile() UTF8Deserializer cannot be used in BatchedSerializer, so always use PickleSerializer() when changing batchSize in zip(). Also, if two RDDs already have the same batch size, they do not need to be re-serialized. Author: Davies Liu Closes #3706 from davies/fix_4841 and squashes the following commits: 20ce3a3 [Davies Liu] fix bug in _reserialize() e3ebf7c [Davies Liu] add comment 379d2c8 [Davies Liu] fix zip with textFile() --- python/pyspark/serializers.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'python/pyspark/serializers.py') diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 33aa55f7f1..bd08c9a6d2 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -463,6 +463,9 @@ class CompressedSerializer(FramedSerializer): def loads(self, obj): return self.serializer.loads(zlib.decompress(obj)) + def __eq__(self, other): + return isinstance(other, CompressedSerializer) and self.serializer == other.serializer + class UTF8Deserializer(Serializer): @@ -489,6 +492,9 @@ class UTF8Deserializer(Serializer): except EOFError: return + def __eq__(self, other): + return isinstance(other, UTF8Deserializer) and self.use_unicode == other.use_unicode + def read_long(stream): length = stream.read(8) -- cgit v1.2.3