From 21570b463388194877003318317aafd842800cac Mon Sep 17 00:00:00 2001
From: Patrick Wendell
Date: Wed, 14 May 2014 22:24:04 -0700
Subject: Documentation: Encourage use of reduceByKey instead of groupByKey.

Author: Patrick Wendell

Closes #784 from pwendell/group-by-key and squashes the following commits:

9b4505f [Patrick Wendell] Small fix
6347924 [Patrick Wendell] Documentation: Encourage use of reduceByKey instead of groupByKey.
---
 python/pyspark/rdd.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 4f74824ba4..07578b8d93 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1152,6 +1152,10 @@ class RDD(object):
         Group the values for each key in the RDD into a single sequence.
         Hash-partitions the resulting RDD with into numPartitions partitions.
 
+        Note: If you are grouping in order to perform an aggregation (such as a
+        sum or average) over each key, using reduceByKey will provide much better
+        performance.
+
        >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
        >>> map((lambda (x,y): (x, list(y))), sorted(x.groupByKey().collect()))
        [('a', [1, 1]), ('b', [1])]
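
The rationale behind the added note: groupByKey ships every value for a key
across the network before any aggregation happens, while reduceByKey combines
values on each partition first and shuffles only the partial results. A
minimal sketch of the recommended pattern follows; the local SparkContext
setup and the variable names are illustrative assumptions, not part of this
patch:

    # Sketch only: assumes a local SparkContext for a standalone run.
    from pyspark import SparkContext

    sc = SparkContext("local", "reduceByKeyExample")

    pairs = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])

    # groupByKey shuffles every individual value, then sums on the reducer side:
    sums_grouped = pairs.groupByKey().mapValues(sum).collect()

    # reduceByKey pre-aggregates map-side, shuffling far less data:
    sums_reduced = pairs.reduceByKey(lambda a, b: a + b).collect()

    print(sorted(sums_grouped))  # [('a', 2), ('b', 1)]
    print(sorted(sums_reduced))  # [('a', 2), ('b', 1)]

    sc.stop()

Both pipelines produce the same result; the difference is purely in how much
data crosses the shuffle boundary, which is why the docstring note steers
aggregations toward reduceByKey.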