From 21570b463388194877003318317aafd842800cac Mon Sep 17 00:00:00 2001
From: Patrick Wendell
Date: Wed, 14 May 2014 22:24:04 -0700
Subject: Documentation: Encourage use of reduceByKey instead of groupByKey.

Author: Patrick Wendell

Closes #784 from pwendell/group-by-key and squashes the following commits:

9b4505f [Patrick Wendell] Small fix
6347924 [Patrick Wendell] Documentation: Encourage use of reduceByKey instead of groupByKey.
---
 .../main/scala/org/apache/spark/api/java/JavaPairRDD.scala | 12 ++++++++++++
 .../main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 12 ++++++++++++
 docs/scala-programming-guide.md                            |  4 ++++
 python/pyspark/rdd.py                                      |  4 ++++
 4 files changed, 32 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
index 554c065358..4c8f9ed6fb 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala
@@ -263,6 +263,10 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
   /**
    * Group the values for each key in the RDD into a single sequence. Allows controlling the
    * partitioning of the resulting key-value pair RDD by passing a Partitioner.
+   *
+   * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over
+   * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]]
+   * will provide much better performance.
    */
   def groupByKey(partitioner: Partitioner): JavaPairRDD[K, JIterable[V]] =
     fromRDD(groupByResultToJava(rdd.groupByKey(partitioner)))
@@ -270,6 +274,10 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
   /**
    * Group the values for each key in the RDD into a single sequence. Hash-partitions the
    * resulting RDD with into `numPartitions` partitions.
+   *
+   * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over
+   * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]]
+   * will provide much better performance.
    */
   def groupByKey(numPartitions: Int): JavaPairRDD[K, JIterable[V]] =
     fromRDD(groupByResultToJava(rdd.groupByKey(numPartitions)))
@@ -380,6 +388,10 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
   /**
    * Group the values for each key in the RDD into a single sequence. Hash-partitions the
    * resulting RDD with the existing partitioner/parallelism level.
+   *
+   * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over
+   * each key, using [[JavaPairRDD.reduceByKey]] or [[JavaPairRDD.combineByKey]]
+   * will provide much better performance.
    */
   def groupByKey(): JavaPairRDD[K, JIterable[V]] =
     fromRDD(groupByResultToJava(rdd.groupByKey()))
diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
index bc6d204434..223fef7926 100644
--- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala
@@ -264,6 +264,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
   /**
    * Group the values for each key in the RDD into a single sequence. Allows controlling the
    * partitioning of the resulting key-value pair RDD by passing a Partitioner.
+   *
+   * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over
+   * each key, using [[PairRDDFunctions.reduceByKey]] or [[PairRDDFunctions.combineByKey]]
+   * will provide much better performance.
    */
   def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])] = {
     // groupByKey shouldn't use map side combine because map side combine does not
@@ -280,6 +284,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
   /**
    * Group the values for each key in the RDD into a single sequence. Hash-partitions the
    * resulting RDD with into `numPartitions` partitions.
+   *
+   * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over
+   * each key, using [[PairRDDFunctions.reduceByKey]] or [[PairRDDFunctions.combineByKey]]
+   * will provide much better performance.
    */
   def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])] = {
     groupByKey(new HashPartitioner(numPartitions))
@@ -365,6 +373,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
   /**
    * Group the values for each key in the RDD into a single sequence. Hash-partitions the
    * resulting RDD with the existing partitioner/parallelism level.
+   *
+   * Note: If you are grouping in order to perform an aggregation (such as a sum or average) over
+   * each key, using [[PairRDDFunctions.reduceByKey]] or [[PairRDDFunctions.combineByKey]]
+   * will provide much better performance.
    */
   def groupByKey(): RDD[(K, Iterable[V])] = {
     groupByKey(defaultPartitioner(self))
diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md
index 3ed86e460c..edaa7d0639 100644
--- a/docs/scala-programming-guide.md
+++ b/docs/scala-programming-guide.md
@@ -196,6 +196,10 @@ The following tables list the transformations and actions currently supported (s
 groupByKey([numTasks])
 When called on a dataset of (K, V) pairs, returns a dataset of (K, Seq[V]) pairs.
+Note: If you are grouping in order to perform an aggregation (such as a sum or
+ average) over each key, using `reduceByKey` or `combineByKey` will yield much better
+ performance.
+
 Note: By default, if the RDD already has a partitioner, the task number is decided by the
 partition number of the partitioner, or else relies on the value of
 spark.default.parallelism if the property is set , otherwise depends on the partition
 number of the RDD. You can pass an optional numTasks argument to set a different
 number of tasks.
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 4f74824ba4..07578b8d93 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1152,6 +1152,10 @@ class RDD(object):
         Group the values for each key in the RDD into a single sequence.
         Hash-partitions the resulting RDD with into numPartitions partitions.
 
+        Note: If you are grouping in order to perform an aggregation (such as a
+        sum or average) over each key, using reduceByKey will provide much better
+        performance.
+
         >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
         >>> map((lambda (x,y): (x, list(y))), sorted(x.groupByKey().collect()))
         [('a', [1, 1]), ('b', [1])]
-- 
cgit v1.2.3
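
The patch above only adds documentation, so it does not show the advice in action. As a rough illustration of the recommendation, here is a minimal, self-contained Scala sketch that computes a per-key sum both ways and a per-key average with combineByKey. It is not part of the patch: the object name GroupVsReduce, the local[2] master, and the tiny sample dataset are assumptions made purely for the example.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.SparkContext._  // pair-RDD implicits (Spark 1.x style)

    object GroupVsReduce {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("GroupVsReduce").setMaster("local[2]"))
        val pairs = sc.parallelize(Seq(("a", 1), ("b", 1), ("a", 1)))

        // groupByKey shuffles every value across the network before the sum is computed.
        val sumsViaGroup = pairs.groupByKey().mapValues(_.sum)

        // reduceByKey combines values map-side before the shuffle, so far less data moves.
        val sumsViaReduce = pairs.reduceByKey(_ + _)

        // combineByKey generalizes this: build (sum, count) per key to get an average.
        val averages = pairs
          .combineByKey(
            (v: Int) => (v.toDouble, 1L),                                   // createCombiner
            (acc: (Double, Long), v: Int) => (acc._1 + v, acc._2 + 1L),     // mergeValue
            (a: (Double, Long), b: (Double, Long)) => (a._1 + b._1, a._2 + b._2)) // mergeCombiners
          .mapValues { case (sum, count) => sum / count }

        println(sumsViaGroup.collect().toSeq)   // e.g. (a,2), (b,1)
        println(sumsViaReduce.collect().toSeq)  // same result, cheaper shuffle
        println(averages.collect().toSeq)
        sc.stop()
      }
    }

Both sum variants return the same pairs; the difference is that reduceByKey (like combineByKey) can pre-aggregate on each map task, which is exactly why the patch steers readers away from groupByKey for aggregations.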