From 092121e477bcd2e474440dbdfdfa69cbd15c4803 Mon Sep 17 00:00:00 2001
From: Davies Liu
Date: Wed, 27 Aug 2014 10:40:35 -0700
Subject: [SPARK-3239] [PySpark] randomize the dirs for each process

This can avoid the IO contention during spilling, when you have multiple
disks.

Author: Davies Liu

Closes #2152 from davies/randomize and squashes the following commits:

a4863c4 [Davies Liu] randomize the dirs for each process
---
 python/pyspark/shuffle.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'python')

diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py
index 1ebe7df418..2750f117ba 100644
--- a/python/pyspark/shuffle.py
+++ b/python/pyspark/shuffle.py
@@ -21,6 +21,7 @@ import platform
 import shutil
 import warnings
 import gc
+import random
 
 from pyspark.serializers import BatchedSerializer, PickleSerializer
 
@@ -216,6 +217,9 @@ class ExternalMerger(Merger):
         """ Get all the directories """
         path = os.environ.get("SPARK_LOCAL_DIRS", "/tmp")
         dirs = path.split(",")
+        if len(dirs) > 1:
+            rnd = random.Random(os.getpid() + id(dirs))
+            random.shuffle(dirs, rnd.random)
         return [os.path.join(d, "python", str(os.getpid()), str(id(self)))
                 for d in dirs]
-- 
cgit v1.2.3