diff options
Diffstat (limited to 'python/pyspark/context.py')
-rw-r--r-- | python/pyspark/context.py | 5 |
1 files changed, 3 insertions, 2 deletions
diff --git a/python/pyspark/context.py b/python/pyspark/context.py index b6c991453d..ec67ec8d0f 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -29,7 +29,7 @@ from pyspark.conf import SparkConf from pyspark.files import SparkFiles from pyspark.java_gateway import launch_gateway from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer, \ - PairDeserializer, CompressedSerializer, AutoBatchedSerializer, NoOpSerializer + PairDeserializer, AutoBatchedSerializer, NoOpSerializer, LargeObjectSerializer from pyspark.storagelevel import StorageLevel from pyspark.rdd import RDD from pyspark.traceback_utils import CallSite, first_spark_call @@ -624,7 +624,8 @@ class SparkContext(object): object for reading it in distributed functions. The variable will be sent to each cluster only once. """ - ser = CompressedSerializer(PickleSerializer()) + ser = LargeObjectSerializer() + # pass large object by py4j is very slow and need much memory tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) ser.dump_stream([value], tempFile) |