aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/rdd.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r--python/pyspark/rdd.py4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 4f74824ba4..07578b8d93 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1152,6 +1152,10 @@ class RDD(object):
Group the values for each key in the RDD into a single sequence.
Hash-partitions the resulting RDD with into numPartitions partitions.
+ Note: If you are grouping in order to perform an aggregation (such as a
+ sum or average) over each key, using reduceByKey will provide much better
+ performance.
+
>>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
>>> map((lambda (x,y): (x, list(y))), sorted(x.groupByKey().collect()))
[('a', [1, 1]), ('b', [1])]