aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/rdd.py
diff options
context:
space:
mode:
authorJosh Rosen <joshrosen@eecs.berkeley.edu>2013-02-03 06:44:49 +0000
committerJosh Rosen <joshrosen@eecs.berkeley.edu>2013-02-03 06:44:49 +0000
commit8fbd5380b7f36842297f624bad3a2513f7eca47b (patch)
treeacddfb8879427a107b33b7b72d8257a02e513ba5 /python/pyspark/rdd.py
parent2415c18f48fc28d88f29b88c312f98054f530f20 (diff)
downloadspark-8fbd5380b7f36842297f624bad3a2513f7eca47b.tar.gz
spark-8fbd5380b7f36842297f624bad3a2513f7eca47b.tar.bz2
spark-8fbd5380b7f36842297f624bad3a2513f7eca47b.zip
Fetch fewer objects in PySpark's take() method.
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r--python/pyspark/rdd.py4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index fb144bc45d..4cda6cf661 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -372,6 +372,10 @@ class RDD(object):
items = []
for partition in range(self._jrdd.splits().size()):
iterator = self.ctx._takePartition(self._jrdd.rdd(), partition)
+ # Each item in the iterator is a string, Python object, batch of
+ # Python objects. Regardless, it is sufficient to take `num`
+ # of these objects in order to collect `num` Python objects:
+ iterator = iterator.take(num)
items.extend(self._collect_iterator_through_file(iterator))
if len(items) >= num:
break