path: root/python/pyspark/tests.py
author     Davies Liu <davies.liu@gmail.com>    2014-09-18 18:11:48 -0700
committer  Josh Rosen <joshrosen@apache.org>    2014-09-18 18:11:48 -0700
commit     e77fa81a61798c89d5a9b6c9dc067d11785254b7 (patch)
tree       2d84f29922e4523f223baff1c84573754c1cf0c7 /python/pyspark/tests.py
parent     9306297d1d888d0430f79b2133ee7377871a3a18 (diff)
[SPARK-3554] [PySpark] use broadcast automatically for large closure
Py4j cannot handle large strings efficiently, so we should use broadcast automatically for large closures. (Broadcast uses the local filesystem to pass the data through.)

Author: Davies Liu <davies.liu@gmail.com>

Closes #2417 from davies/command and squashes the following commits:

fbf4e97 [Davies Liu] bugfix
aefd508 [Davies Liu] use broadcast automatically for large closure
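The idea, sketched below as a minimal illustration rather than Spark's actual internals: when the pickled closure grows beyond some size threshold, ship it through a broadcast variable (which moves its payload via the local filesystem) instead of pushing the raw bytes through Py4J. The helper name _prepare_command and the 1 MB threshold are assumptions for illustration only.

import pickle

_BROADCAST_THRESHOLD = 1 << 20  # hypothetical 1 MB cutoff, for illustration only

def _prepare_command(sc, func):
    # Pickle the closure; small payloads go through Py4J directly,
    # large ones are wrapped in a broadcast variable instead.
    pickled = pickle.dumps(func)
    if len(pickled) <= _BROADCAST_THRESHOLD:
        return pickled, None
    bvar = sc.broadcast(pickled)   # broadcast data travels via local files
    return None, bvar              # the worker side would read bvar.value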
Diffstat (limited to 'python/pyspark/tests.py')
-rw-r--r--  python/pyspark/tests.py  |  6 ++++++
1 file changed, 6 insertions(+), 0 deletions(-)
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 0b3854347a..7301966e48 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -434,6 +434,12 @@ class TestRDDFunctions(PySparkTestCase):
         m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
         self.assertEquals(N, m)
 
+    def test_large_closure(self):
+        N = 1000000
+        data = [float(i) for i in xrange(N)]
+        m = self.sc.parallelize(range(1), 1).map(lambda x: len(data)).sum()
+        self.assertEquals(N, m)
+
     def test_zip_with_different_serializers(self):
         a = self.sc.parallelize(range(5))
         b = self.sc.parallelize(range(100, 105))
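For reference, a standalone sketch of the scenario the new test exercises, assuming a local PySpark installation; with this change the large captured list is broadcast behind the scenes, so no explicit sc.broadcast call is needed in user code.

from pyspark import SparkContext

sc = SparkContext("local", "large-closure-demo")

N = 1000000
data = [float(i) for i in range(N)]   # large driver-side object captured by the closure

# After SPARK-3554 the serialized command is shipped via a broadcast variable
# when it is large, so this works without calling sc.broadcast(data) explicitly.
result = sc.parallelize(range(1), 1).map(lambda x: len(data)).sum()
assert result == N

sc.stop()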