diff options
author | Josh Rosen <joshrosen@apache.org> | 2014-01-28 19:50:26 -0800 |
---|---|---|
committer | Josh Rosen <joshrosen@apache.org> | 2014-01-28 20:20:08 -0800 |
commit | 1381fc72f7a34f690a98ab72cec8ffb61e0e564d (patch) | |
tree | 8ae129c4b291b4b5589a77b919f508c4535fbf2c /python/pyspark/context.py | |
parent | 84670f2715392859624df290c1b52eb4ed4a9cb1 (diff) | |
download | spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.tar.gz spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.tar.bz2 spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.zip |
Switch from MUTF8 to UTF8 in PySpark serializers.
This fixes SPARK-1043, a bug introduced in 0.9.0
where PySpark couldn't serialize strings > 64kB.
This fix was written by @tyro89 and @bouk in #512.
This commit squashes and rebases their pull request
in order to fix some merge conflicts.
Diffstat (limited to 'python/pyspark/context.py')
-rw-r--r-- | python/pyspark/context.py | 4 |
1 file changed, 2 insertions, 2 deletions
```diff
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index f955aad7a4..f318b5d9a7 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -27,7 +27,7 @@ from pyspark.broadcast import Broadcast
 from pyspark.conf import SparkConf
 from pyspark.files import SparkFiles
 from pyspark.java_gateway import launch_gateway
-from pyspark.serializers import PickleSerializer, BatchedSerializer, MUTF8Deserializer
+from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer
 from pyspark.storagelevel import StorageLevel
 from pyspark.rdd import RDD
 
@@ -234,7 +234,7 @@ class SparkContext(object):
         """
         minSplits = minSplits or min(self.defaultParallelism, 2)
         return RDD(self._jsc.textFile(name, minSplits), self,
-                   MUTF8Deserializer())
+                   UTF8Deserializer())
 
     def _checkpointFile(self, name, input_deserializer):
         jrdd = self._jsc.checkpointFile(name)
```