author     Davies Liu <davies@databricks.com>   2015-01-29 17:28:37 -0800
committer  Josh Rosen <joshrosen@databricks.com>   2015-02-16 20:35:02 -0800
commit     a39da171cb7fea2f32367edd60c2644aadb88282 (patch)
tree       7c693f911f534342ebaed5f173161674fbb348f0
parent     f468688f1775b4dbc19cd8c5d466a6a2891a1358 (diff)
[SPARK-5395] [PySpark] fix python process leak while coalesce()
Currently, the Python worker process is released back into the idle pool only after the whole task has finished, which causes many processes to be forked when coalesce() is called. This PR changes it to release the process as soon as all of its data has been read (i.e., once its partition is finished), so a single process can be reused to process multiple partitions within one task.

Author: Davies Liu <davies@databricks.com>

Closes #4238 from davies/py_leak and squashes the following commits:

ec80a43 [Davies Liu] add @volatile
6da437a [Davies Liu] address comments
24ed322 [Davies Liu] fix python process leak while coalesce()

(cherry picked from commit 5c746eedda8cff2fc1692cf6dce376f4b0ca6fac)
Signed-off-by: Josh Rosen <joshrosen@databricks.com>
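To make the difference concrete, below is a minimal, self-contained sketch in plain Scala (not Spark code; PoolSketch, Worker, checkout, release, and runTask are invented for illustration) of the two release strategies. Releasing a pooled worker as soon as its partition is fully read keeps one live process per task, while releasing only at task end forks one process per coalesced parent partition:

import scala.collection.mutable

// Hypothetical stand-in for Spark's Python worker pool: reuse an idle
// worker if one exists, otherwise "fork" a new one.
object PoolSketch {
  case class Worker(id: Int)
  private val idle = mutable.Queue[Worker]()
  var forked = 0

  def checkout(): Worker =
    if (idle.nonEmpty) idle.dequeue() else { forked += 1; Worker(forked) }

  def release(w: Worker): Unit = idle.enqueue(w)

  // One coalesced task that consumes several parent partitions in
  // sequence. eager = true mirrors this patch (release each worker once
  // its partition is exhausted); eager = false mirrors the old behavior
  // (hold every worker until the task-completion listener runs).
  def runTask(partitions: Seq[Seq[Int]], eager: Boolean): Int = {
    val held = mutable.Buffer[Worker]()
    var total = 0
    for (part <- partitions) {
      val w = checkout()
      total += part.sum                    // read all data from this worker
      if (eager) release(w) else held += w
    }
    held.foreach(release)                  // task-end cleanup
    total
  }

  def main(args: Array[String]): Unit = {
    val parts = Seq(Seq(1, 2), Seq(3, 4), Seq(5, 6))
    runTask(parts, eager = false)
    println(s"release at task end: forked $forked workers")  // forked 3
    forked = 0; idle.clear()
    runTask(parts, eager = true)
    println(s"eager release: forked $forked workers")        // forked 1
  }
}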
-rw-r--r--  core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala | 13
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 2b6788d404..0d508d624f 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -68,17 +68,16 @@ private[spark] class PythonRDD(
       envVars += ("SPARK_REUSE_WORKER" -> "1")
     }
     val worker: Socket = env.createPythonWorker(pythonExec, envVars.toMap)
+    // Whether the worker has been released into the idle pool
+    @volatile var released = false
 
     // Start a thread to feed the process input from our parent's iterator
     val writerThread = new WriterThread(env, worker, split, context)
 
-    var complete_cleanly = false
     context.addTaskCompletionListener { context =>
       writerThread.shutdownOnTaskCompletion()
       writerThread.join()
-      if (reuse_worker && complete_cleanly) {
-        env.releasePythonWorker(pythonExec, envVars.toMap, worker)
-      } else {
+      if (!reuse_worker || !released) {
         try {
           worker.close()
         } catch {
@@ -146,8 +145,12 @@ private[spark] class PythonRDD(
               stream.readFully(update)
               accumulator += Collections.singletonList(update)
             }
+            // Check whether the worker is ready to be re-used.
             if (stream.readInt() == SpecialLengths.END_OF_STREAM) {
-              complete_cleanly = true
+              if (reuse_worker) {
+                env.releasePythonWorker(pythonExec, envVars.toMap, worker)
+                released = true
+              }
             }
             null
           }
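A note on the @volatile in the first hunk: released is written by the thread that reads END_OF_STREAM from the worker's output and read by the task-completion listener, which can run on a different thread, so the annotation guarantees the write is visible there and a worker already returned to the pool is never closed. A minimal sketch of that handshake, with invented names (WorkerLifecycleSketch, onEndOfStream, onTaskCompletion) rather than Spark's actual API:

// The reading side publishes released once the stream ends cleanly; the
// cleanup side only destroys the worker if it was never handed back.
class WorkerLifecycleSketch(reuseWorker: Boolean) {
  @volatile private var released = false

  // Called by the thread consuming the worker's output stream.
  def onEndOfStream(releaseToPool: () => Unit): Unit = {
    if (reuseWorker) {
      releaseToPool()   // worker finished its partition cleanly; reuse it
      released = true   // publish the fact to the completion listener
    }
  }

  // Called from the task-completion listener, possibly on another thread.
  def onTaskCompletion(closeWorker: () => Unit): Unit = {
    // If the worker was never released (reuse disabled, task failed, or
    // the stream ended abnormally), close the socket instead of leaking it.
    if (!reuseWorker || !released) closeWorker()
  }
}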