diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/rdd.py | 4 |
1 files changed, 2 insertions, 2 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index f4cfe4845d..efd2f35912 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -1634,8 +1634,8 @@ class RDD(object): Hash-partitions the resulting RDD with into numPartitions partitions. Note: If you are grouping in order to perform an aggregation (such as a - sum or average) over each key, using reduceByKey will provide much - better performance. + sum or average) over each key, using reduceByKey or aggregateByKey will + provide much better performance. >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)]) >>> map((lambda (x,y): (x, list(y))), sorted(x.groupByKey().collect())) |