aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/rdd.py
diff options
context:
space:
mode:
authorDavies Liu <davies.liu@gmail.com>2014-10-01 11:21:34 -0700
committerJosh Rosen <joshrosen@apache.org>2014-10-01 11:21:34 -0700
commitabf588f47a26d0066f0b75d52b200a87bb085064 (patch)
treed3e9b8a57688805262b7d00c99f596327912ea23 /python/pyspark/rdd.py
parent0bfd3afb00936b0f46ba613be0982e38bc7032b5 (diff)
downloadspark-abf588f47a26d0066f0b75d52b200a87bb085064.tar.gz
spark-abf588f47a26d0066f0b75d52b200a87bb085064.tar.bz2
spark-abf588f47a26d0066f0b75d52b200a87bb085064.zip
[SPARK-3749] [PySpark] fix bugs in broadcast large closure of RDD
1. broadcast is triggle unexpected 2. fd is leaked in JVM (also leak in parallelize()) 3. broadcast is not unpersisted in JVM after RDD is not be used any more. cc JoshRosen , sorry for these stupid bugs. Author: Davies Liu <davies.liu@gmail.com> Closes #2603 from davies/fix_broadcast and squashes the following commits: 080a743 [Davies Liu] fix bugs in broadcast large closure of RDD
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r--python/pyspark/rdd.py12
1 files changed, 9 insertions, 3 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 8ed89e2f97..dc6497772e 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2073,6 +2073,12 @@ class PipelinedRDD(RDD):
self._jrdd_deserializer = self.ctx.serializer
self._bypass_serializer = False
self._partitionFunc = prev._partitionFunc if self.preservesPartitioning else None
+ self._broadcast = None
+
+ def __del__(self):
+ if self._broadcast:
+ self._broadcast.unpersist()
+ self._broadcast = None
@property
def _jrdd(self):
@@ -2087,9 +2093,9 @@ class PipelinedRDD(RDD):
# the serialized command will be compressed by broadcast
ser = CloudPickleSerializer()
pickled_command = ser.dumps(command)
- if pickled_command > (1 << 20): # 1M
- broadcast = self.ctx.broadcast(pickled_command)
- pickled_command = ser.dumps(broadcast)
+ if len(pickled_command) > (1 << 20): # 1M
+ self._broadcast = self.ctx.broadcast(pickled_command)
+ pickled_command = ser.dumps(self._broadcast)
broadcast_vars = ListConverter().convert(
[x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
self.ctx._gateway._gateway_client)