From ce92a9c18f033ac9fa2f12143fab00a90e0f4577 Mon Sep 17 00:00:00 2001
From: Sandy Ryza <sandy@cloudera.com>
Date: Thu, 12 Jun 2014 08:14:25 -0700
Subject: SPARK-554.  Add aggregateByKey.

Author: Sandy Ryza <sandy@cloudera.com>

Closes #705 from sryza/sandy-spark-554 and squashes the following commits:

2302b8f [Sandy Ryza] Add MIMA exclude
f52e0ad [Sandy Ryza] Fix Python tests for real
2f3afa3 [Sandy Ryza] Fix Python test
0b735e9 [Sandy Ryza] Fix line lengths
ae56746 [Sandy Ryza] Fix doc (replace T with V)
c2be415 [Sandy Ryza] Java and Python aggregateByKey
23bf400 [Sandy Ryza] SPARK-554.  Add aggregateByKey.
---
 python/pyspark/rdd.py   | 19 ++++++++++++++++++-
 python/pyspark/tests.py | 15 +++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 8a215fc511..735389c698 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1178,6 +1178,20 @@ class RDD(object):
                     combiners[k] = mergeCombiners(combiners[k], v)
             return combiners.iteritems()
         return shuffled.mapPartitions(_mergeCombiners)
+   
+    def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None):
+        """
+        Aggregate the values of each key, using given combine functions and a neutral "zero value".
+        This function can return a different result type, U, than the type of the values in this RDD,
+        V. Thus, we need one operation for merging a V into a U and one operation for merging two U's.
+        The former operation is used for merging values within a partition, and the latter is used
+        for merging values between partitions. To avoid memory allocation, both of these functions are
+        allowed to modify and return their first argument instead of creating a new U.
+        """
+        def createZero():
+          return copy.deepcopy(zeroValue)
+        
+        return self.combineByKey(lambda v: seqFunc(createZero(), v), seqFunc, combFunc, numPartitions)
 
     def foldByKey(self, zeroValue, func, numPartitions=None):
         """
@@ -1190,7 +1204,10 @@ class RDD(object):
         >>> rdd.foldByKey(0, add).collect()
         [('a', 2), ('b', 1)]
         """
-        return self.combineByKey(lambda v: func(zeroValue, v), func, func, numPartitions)
+        def createZero():
+          return copy.deepcopy(zeroValue)
+
+        return self.combineByKey(lambda v: func(createZero(), v), func, func, numPartitions)
 
 
     # TODO: support variant with custom partitioner
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 184ee810b8..c15bb45775 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -188,6 +188,21 @@ class TestRDDFunctions(PySparkTestCase):
         os.unlink(tempFile.name)
         self.assertRaises(Exception, lambda: filtered_data.count())
 
+    def testAggregateByKey(self):
+        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)
+        def seqOp(x, y):
+            x.add(y)
+            return x
+
+        def combOp(x, y):
+            x |= y
+            return x
+          
+        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
+        self.assertEqual(3, len(sets))
+        self.assertEqual(set([1]), sets[1])
+        self.assertEqual(set([2]), sets[3])
+        self.assertEqual(set([1, 3]), sets[5])
 
 class TestIO(PySparkTestCase):
 
-- 
cgit v1.2.3