diff options
author | Patrick Wendell <pwendell@gmail.com> | 2013-10-24 10:52:25 -0700 |
---|---|---|
committer | Patrick Wendell <pwendell@gmail.com> | 2013-10-24 14:31:34 -0700 |
commit | 2fda84fe3ffa4d887e58b4d1717765a42a30b0f9 (patch) | |
tree | ed7b401310a80ba7168c7b5a371d2f35d30d5f1f | |
parent | 08c1a42d7d9edef02a24a3bc5045b2dce035a93b (diff) | |
download | spark-2fda84fe3ffa4d887e58b4d1717765a42a30b0f9.tar.gz spark-2fda84fe3ffa4d887e58b4d1717765a42a30b0f9.tar.bz2 spark-2fda84fe3ffa4d887e58b4d1717765a42a30b0f9.zip |
Always use a shuffle
-rw-r--r-- | core/src/main/scala/org/apache/spark/rdd/RDD.scala | 22 |
1 files changed, 7 insertions, 15 deletions
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index e2652f13c4..17bc2515f2 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -268,22 +268,14 @@ abstract class RDD[T: ClassManifest]( /** * Return a new RDD that has exactly numPartitions partitions. * - * Used to increase or decrease the level of parallelism in this RDD. By default, this will use - * a shuffle to redistribute data. If you are shrinking the RDD into fewer partitions, you can - * set skipShuffle = false to avoid a shuffle. Skipping shuffles is not supported when - * increasing the number of partitions. + * Used to increase or decrease the level of parallelism in this RDD. This will use + * a shuffle to redistribute data. * - * Similar to `coalesce`, but shuffles by default, allowing you to call this safely even - * if you don't know the number of partitions. - */ - def repartition(numPartitions: Int, skipShuffle: Boolean = false): RDD[T] = { - if (skipShuffle && numPartitions > this.partitions.size) { - val msg = "repartition must grow %s from %s to %s partitions, cannot skip shuffle.".format( - this.name, this.partitions.size, numPartitions - ) - throw new IllegalArgumentException(msg) - } - coalesce(numPartitions, !skipShuffle) + * If you are decreasing the number of partitions in this RDD, consider using `coalesce`, + * which can avoid performing a shuffle. + */ + def repartition(numPartitions: Int): RDD[T] = { + coalesce(numPartitions, true) } /** |