aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/rdd.py14
1 files changed, 14 insertions, 0 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index efd2f35912..014c0aa889 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2059,6 +2059,20 @@ class RDD(object):
hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF)
return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)
+ def toLocalIterator(self):
+ """
+ Return an iterator that contains all of the elements in this RDD.
+ The iterator will consume as much memory as the largest partition in this RDD.
+ >>> rdd = sc.parallelize(range(10))
+ >>> [x for x in rdd.toLocalIterator()]
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+ """
+ partitions = xrange(self.getNumPartitions())
+ for partition in partitions:
+ rows = self.context.runJob(self, lambda x: x, [partition])
+ for row in rows:
+ yield row
+
class PipelinedRDD(RDD):