about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- python/pyspark/rdd.py 14
1 files changed, 14 insertions, 0 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index e1043ad564..39916d21c7 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -946,7 +946,21 @@ class RDD(object):
combiners[k] = mergeCombiners(combiners[k], v)
return combiners.iteritems()
return shuffled.mapPartitions(_mergeCombiners)
+
+ def foldByKey(self, zeroValue, func, numPartitions=None):
+ """
+ Merge the values for each key using an associative function "func" and a neutral "zeroValue",
+ which may be added to the result an arbitrary number of times and must not change
+ the result (e.g., 0 for addition, or 1 for multiplication).
+ >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
+ >>> from operator import add
+ >>> rdd.foldByKey(0, add).collect()
+ [('a', 2), ('b', 1)]
+ """
+ return self.combineByKey(lambda v: func(zeroValue, v), func, func, numPartitions)
+
+
# TODO: support variant with custom partitioner
def groupByKey(self, numPartitions=None):
"""