diff options
author | Josh Rosen <joshrosen@apache.org> | 2014-01-28 19:50:26 -0800 |
---|---|---|
committer | Josh Rosen <joshrosen@apache.org> | 2014-01-28 20:20:08 -0800 |
commit | 1381fc72f7a34f690a98ab72cec8ffb61e0e564d (patch) | |
tree | 8ae129c4b291b4b5589a77b919f508c4535fbf2c /python/pyspark/worker.py | |
parent | 84670f2715392859624df290c1b52eb4ed4a9cb1 (diff) | |
download | spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.tar.gz spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.tar.bz2 spark-1381fc72f7a34f690a98ab72cec8ffb61e0e564d.zip |
Switch from MUTF8 to UTF8 in PySpark serializers.
This fixes SPARK-1043, a bug introduced in 0.9.0
where PySpark couldn't serialize strings > 64kB.
This fix was written by @tyro89 and @bouk in #512.
This commit squashes and rebases their pull request
in order to fix some merge conflicts.
Diffstat (limited to 'python/pyspark/worker.py')
-rw-r--r-- | python/pyspark/worker.py | 8 |
1 file changed, 4 insertions, 4 deletions
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index d77981f61f..4be4063dcf 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -30,11 +30,11 @@ from pyspark.broadcast import Broadcast, _broadcastRegistry from pyspark.cloudpickle import CloudPickler from pyspark.files import SparkFiles from pyspark.serializers import write_with_length, write_int, read_long, \ - write_long, read_int, SpecialLengths, MUTF8Deserializer, PickleSerializer + write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer pickleSer = PickleSerializer() -mutf8_deserializer = MUTF8Deserializer() +utf8_deserializer = UTF8Deserializer() def report_times(outfile, boot, init, finish): @@ -51,7 +51,7 @@ def main(infile, outfile): return # fetch name of workdir - spark_files_dir = mutf8_deserializer.loads(infile) + spark_files_dir = utf8_deserializer.loads(infile) SparkFiles._root_directory = spark_files_dir SparkFiles._is_running_on_worker = True @@ -66,7 +66,7 @@ def main(infile, outfile): sys.path.append(spark_files_dir) # *.py files that were added will be copied here num_python_includes = read_int(infile) for _ in range(num_python_includes): - filename = mutf8_deserializer.loads(infile) + filename = utf8_deserializer.loads(infile) sys.path.append(os.path.join(spark_files_dir, filename)) command = pickleSer._read_with_length(infile) |