aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/serializers.py
diff options
context:
space:
mode:
author: Josh Rosen <joshrosen@apache.org>, 2014-01-28 19:50:26 -0800
committer: Josh Rosen <joshrosen@apache.org>, 2014-01-28 20:20:08 -0800
commit: 1381fc72f7a34f690a98ab72cec8ffb61e0e564d (patch)
tree: 8ae129c4b291b4b5589a77b919f508c4535fbf2c /python/pyspark/serializers.py
parent: 84670f2715392859624df290c1b52eb4ed4a9cb1 (diff)
download: spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.tar.gz
spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.tar.bz2
spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.zip
Switch from MUTF8 to UTF8 in PySpark serializers.
This fixes SPARK-1043, a bug introduced in 0.9.0 where PySpark couldn't serialize strings > 64kB. This fix was written by @tyro89 and @bouk in #512. This commit squashes and rebases their pull request in order to fix some merge conflicts.
Diffstat (limited to 'python/pyspark/serializers.py')
-rw-r--r-- python/pyspark/serializers.py | 6
1 files changed, 3 insertions, 3 deletions
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index 2a500ab919..8c6ad79059 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -261,13 +261,13 @@ class MarshalSerializer(FramedSerializer):
loads = marshal.loads
-class MUTF8Deserializer(Serializer):
+class UTF8Deserializer(Serializer):
"""
- Deserializes streams written by Java's DataOutputStream.writeUTF().
+ Deserializes streams written by getBytes.
"""
def loads(self, stream):
- length = struct.unpack('>H', stream.read(2))[0]
+ length = read_int(stream)
return stream.read(length).decode('utf8')
def load_stream(self, stream):