From 21570b463388194877003318317aafd842800cac Mon Sep 17 00:00:00 2001
From: Patrick Wendell
Date: Wed, 14 May 2014 22:24:04 -0700
Subject: Documentation: Encourage use of reduceByKey instead of groupByKey.

Author: Patrick Wendell

Closes #784 from pwendell/group-by-key and squashes the following commits:

9b4505f [Patrick Wendell] Small fix
6347924 [Patrick Wendell] Documentation: Encourage use of reduceByKey instead of groupByKey.
---
 python/pyspark/rdd.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 4f74824ba4..07578b8d93 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1152,6 +1152,10 @@ class RDD(object):
         Group the values for each key in the RDD into a single sequence.
         Hash-partitions the resulting RDD with into numPartitions partitions.
 
+        Note: If you are grouping in order to perform an aggregation (such as a
+        sum or average) over each key, using reduceByKey will provide much better
+        performance.
+
        >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
        >>> map((lambda (x,y): (x, list(y))), sorted(x.groupByKey().collect()))
        [('a', [1, 1]), ('b', [1])]
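
The rationale behind the added note: groupByKey ships every value for a key
across the network before any aggregation happens, while reduceByKey combines
values on each partition first and shuffles only the partial results. A
minimal sketch of the recommended pattern follows; the local SparkContext
setup and the variable names are illustrative assumptions, not part of this
patch:

    # Sketch only: assumes a local SparkContext for a standalone run.
    from pyspark import SparkContext

    sc = SparkContext("local", "reduceByKeyExample")

    pairs = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])

    # groupByKey shuffles every individual value, then sums on the reducer side:
    sums_grouped = pairs.groupByKey().mapValues(sum).collect()

    # reduceByKey pre-aggregates map-side, shuffling far less data:
    sums_reduced = pairs.reduceByKey(lambda a, b: a + b).collect()

    print(sorted(sums_grouped))  # [('a', 2), ('b', 1)]
    print(sorted(sums_reduced))  # [('a', 2), ('b', 1)]

    sc.stop()

Both pipelines produce the same result; the difference is purely in how much
data crosses the shuffle boundary, which is why the docstring note steers
aggregations toward reduceByKey.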