From 3c8650c12ad7a97852e7bd76153210493fd83e92 Mon Sep 17 00:00:00 2001
From: Davies Liu
Date: Thu, 15 Jan 2015 11:40:41 -0800
Subject: [SPARK-5224] [PySpark] improve performance of parallelize list/ndarray

After the default batchSize was changed to 0 (batched based on the size of
the object), parallelize() still used BatchedSerializer with batchSize=1;
this PR uses batchSize=1024 for parallelize() by default.

Also, BatchedSerializer did not work well with list and numpy.ndarray;
this PR improves BatchedSerializer by using __len__ and __getslice__.

Here is the benchmark for parallelizing 1 million ints as a list or an
ndarray:

              | before | after | improvement
------------- | ------ | ----- | -----------
list          | 11.7 s | 0.8 s | 14x
numpy.ndarray | 32 s   | 0.7 s | 40x

Author: Davies Liu

Closes #4024 from davies/opt_numpy and squashes the following commits:

7618c7c [Davies Liu] improve performance of parallelize list/ndarray
---
 python/pyspark/context.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index 593d74bca5..64f6a3ca6b 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -319,7 +319,7 @@ class SparkContext(object):
         # Make sure we distribute data evenly if it's smaller than self.batchSize
         if "__len__" not in dir(c):
             c = list(c)    # Make it a list so we can compute its length
-        batchSize = max(1, min(len(c) // numSlices, self._batchSize))
+        batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))
         serializer = BatchedSerializer(self._unbatched_serializer, batchSize)
         serializer.dump_stream(c, tempFile)
         tempFile.close()
--
cgit v1.2.3
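
For context, the batching strategy this patch enables can be sketched outside
Spark. The helper below is a minimal, hypothetical illustration (the names
compute_batch_size and iter_batches are not PySpark API): it mirrors how the
patched line picks a batch size when self._batchSize is 0 ("auto"), and how a
sequence that supports len() and slicing (list or numpy.ndarray) can be
chunked one slice at a time instead of one element at a time, which is where
the 14x-40x speedup comes from.

    # Hypothetical sketch; names are illustrative, not PySpark API.
    # Assumes the input supports len() and slicing, as both list and
    # numpy.ndarray do.

    def compute_batch_size(c, num_slices, configured_batch_size):
        # Mirrors the patched line: when the configured size is 0
        # ("auto"), fall back to 1024 instead of batches of size 1.
        return max(1, min(len(c) // num_slices, configured_batch_size or 1024))

    def iter_batches(c, batch_size):
        # Slice-based batching: one slice per batch rather than
        # appending items one by one, so list and ndarray stay fast.
        for start in range(0, len(c), batch_size):
            yield c[start:start + batch_size]

    if __name__ == "__main__":
        data = list(range(1000000))
        size = compute_batch_size(data, num_slices=8, configured_batch_size=0)
        print(size)                                    # min(125000, 1024) -> 1024
        print(sum(1 for _ in iter_batches(data, size)))  # 977 batches

Batching by slicing is what makes __len__ and __getslice__ support matter:
for a plain iterator, each batch must still be built element by element,
while a sliceable sequence hands back a whole batch in one call.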