aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorDavies Liu <davies.liu@gmail.com>2014-08-27 10:40:35 -0700
committerMatei Zaharia <matei@databricks.com>2014-08-27 10:40:35 -0700
commit092121e477bcd2e474440dbdfdfa69cbd15c4803 (patch)
tree09290f15c27f22a76e89299369e9be17058ebff6 /python
parent8f8e2a4ee7419a96196727704695f5114da5b84e (diff)
downloadspark-092121e477bcd2e474440dbdfdfa69cbd15c4803.tar.gz
spark-092121e477bcd2e474440dbdfdfa69cbd15c4803.tar.bz2
spark-092121e477bcd2e474440dbdfdfa69cbd15c4803.zip
[SPARK-3239] [PySpark] randomize the dirs for each process
This can avoid the IO contention during spilling, when you have multiple disks. Author: Davies Liu <davies.liu@gmail.com> Closes #2152 from davies/randomize and squashes the following commits: a4863c4 [Davies Liu] randomize the dirs for each process
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/shuffle.py4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py
index 1ebe7df418..2750f117ba 100644
--- a/python/pyspark/shuffle.py
+++ b/python/pyspark/shuffle.py
@@ -21,6 +21,7 @@ import platform
import shutil
import warnings
import gc
+import random
from pyspark.serializers import BatchedSerializer, PickleSerializer
@@ -216,6 +217,9 @@ class ExternalMerger(Merger):
""" Get all the directories """
path = os.environ.get("SPARK_LOCAL_DIRS", "/tmp")
dirs = path.split(",")
+ if len(dirs) > 1:
+ rnd = random.Random(os.getpid() + id(dirs))
+ random.shuffle(dirs, rnd.random)
return [os.path.join(d, "python", str(os.getpid()), str(id(self)))
for d in dirs]