aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/rdd.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r--python/pyspark/rdd.py12
1 files changed, 10 insertions, 2 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 70db4bbe4c..98a8ff8606 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -813,13 +813,21 @@ class RDD(object):
def fold(self, zeroValue, op):
"""
Aggregate the elements of each partition, and then the results for all
- the partitions, using a given associative function and a neutral "zero
- value."
+ the partitions, using a given associative and commutative function and
+ a neutral "zero value."
The function C{op(t1, t2)} is allowed to modify C{t1} and return it
as its result value to avoid object allocation; however, it should not
modify C{t2}.
+ This behaves somewhat differently from fold operations implemented
+ for non-distributed collections in functional languages like Scala.
+ This fold operation may be applied to partitions individually, and then
+ fold those results into the final result, rather than apply the fold
+ to each element sequentially in some defined ordering. For functions
+ that are not commutative, the result may differ from that of a fold
+ applied to a non-distributed collection.
+
>>> from operator import add
>>> sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)
15