diff options
author | Davies Liu <davies.liu@gmail.com> | 2014-08-27 10:40:35 -0700 |
---|---|---|
committer | Matei Zaharia <matei@databricks.com> | 2014-08-27 10:40:35 -0700 |
commit | 092121e477bcd2e474440dbdfdfa69cbd15c4803 (patch) | |
tree | 09290f15c27f22a76e89299369e9be17058ebff6 /python | |
parent | 8f8e2a4ee7419a96196727704695f5114da5b84e (diff) | |
download | spark-092121e477bcd2e474440dbdfdfa69cbd15c4803.tar.gz spark-092121e477bcd2e474440dbdfdfa69cbd15c4803.tar.bz2 spark-092121e477bcd2e474440dbdfdfa69cbd15c4803.zip |
[SPARK-3239] [PySpark] randomize the dirs for each process
This can avoid I/O contention during spilling when you have multiple disks.
Author: Davies Liu <davies.liu@gmail.com>
Closes #2152 from davies/randomize and squashes the following commits:
a4863c4 [Davies Liu] randomize the dirs for each process
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/shuffle.py | 4 |
1 file changed, 4 insertions, 0 deletions
diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index 1ebe7df418..2750f117ba 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -21,6 +21,7 @@ import platform import shutil import warnings import gc +import random from pyspark.serializers import BatchedSerializer, PickleSerializer @@ -216,6 +217,9 @@ class ExternalMerger(Merger): """ Get all the directories """ path = os.environ.get("SPARK_LOCAL_DIRS", "/tmp") dirs = path.split(",") + if len(dirs) > 1: + rnd = random.Random(os.getpid() + id(dirs)) + random.shuffle(dirs, rnd.random) return [os.path.join(d, "python", str(os.getpid()), str(id(self))) for d in dirs] |