path: root/python/pyspark/rdd.py
author     Matei Zaharia <matei@eecs.berkeley.edu>    2013-07-28 23:38:56 -0400
committer  Matei Zaharia <matei@eecs.berkeley.edu>    2013-07-29 02:51:43 -0400
commit     b5ec35562210c8e7ca4fea07a0d46cb255988c0d (patch)
tree       5ff793af32f83fb259508355e536f7caa2fbe7c0 /python/pyspark/rdd.py
parent     b9d6783f36d527f5082bf13a4ee6fd108e97795c (diff)
Optimize Python foreach() to not return as many objects
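
The change addresses a quirk of the old implementation: foreach() was expressed as self.map(f).collect(), so the driver pulled back one return value (typically None) for every element in the RDD even though foreach() discards them. The new version, shown in the diff below, applies f within each partition and yields a single None per partition, so collect() only transfers one object per partition. A minimal sketch of the two approaches; old_foreach and new_foreach are hypothetical names used here for illustration, not part of the PySpark RDD API:

    # Sketch only: old_foreach/new_foreach are illustrative helpers, not PySpark API.
    def old_foreach(rdd, f):
        # collect() returns one object per element; all of them travel back to the driver.
        rdd.map(f).collect()

    def new_foreach(rdd, f):
        # Only one placeholder object per partition is returned to the driver.
        def processPartition(iterator):
            for x in iterator:
                f(x)
            yield None
        rdd.mapPartitions(processPartition).collect()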
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r--  python/pyspark/rdd.py  6
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 6efa61aa66..4aafe35d13 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -267,7 +267,11 @@ class RDD(object):
         >>> def f(x): print x
         >>> sc.parallelize([1, 2, 3, 4, 5]).foreach(f)
         """
-        self.map(f).collect()  # Force evaluation
+        def processPartition(iterator):
+            for x in iterator:
+                f(x)
+            yield None
+        self.mapPartitions(processPartition).collect()  # Force evaluation
 
     def collect(self):
         """