author     Matei Zaharia <matei.zaharia@gmail.com>  2013-09-09 13:45:40 -0700
committer  Matei Zaharia <matei.zaharia@gmail.com>  2013-09-09 13:45:40 -0700
commit     a85758c20006537e7863cf30bc3c4b6b6efd4981 (patch)
tree       e0ddc6cac1625313a29f1cabb143ff5f6b90a945
parent     084fc369614766ce9ca3285c784afceddd111d25 (diff)
parent     59003d387d399d6280a42b4c79eef18cfdbfe72b (diff)
Merge pull request #907 from stephenh/document_coalesce_shuffle
Add better docs for coalesce.
-rw-r--r--  core/src/main/scala/org/apache/spark/rdd/RDD.scala       17
-rw-r--r--  core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala  14
2 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index e143ecd096..1082cbae3e 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -267,6 +267,23 @@ abstract class RDD[T: ClassManifest](
/**
* Return a new RDD that is reduced into `numPartitions` partitions.
+ *
+ * This results in a narrow dependency, e.g. if you go from 1000 partitions
+ * to 100 partitions, there will not be a shuffle; instead, each of the 100
+ * new partitions will claim 10 of the current partitions.
+ *
+ * However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,
+ * this may result in your computation taking place on fewer nodes than
+ * you like (e.g. one node in the case of numPartitions = 1). To avoid this,
+ * you can pass shuffle = true. This will add a shuffle step, but it means
+ * the current upstream partitions will be executed in parallel (per
+ * whatever the current partitioning is).
+ *
+ * Note: With shuffle = true, you can actually coalesce to a larger number
+ * of partitions. This is useful if you have a small number of partitions,
+ * say 100, potentially with a few partitions being abnormally large. Calling
+ * coalesce(1000, shuffle = true) will result in 1000 partitions with the
+ * data distributed using a hash partitioner.
*/
def coalesce(numPartitions: Int, shuffle: Boolean = false): RDD[T] = {
if (shuffle) {
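
To make the documented behavior concrete, here is a minimal usage sketch. It is not part of this patch, and it assumes a running SparkContext named sc:

    val data = sc.parallelize(1 to 1000, 1000)

    // Narrow dependency: each of the 100 output partitions claims 10 of the
    // 1000 input partitions, so no shuffle is performed.
    val narrowed = data.coalesce(100)

    // A drastic coalesce without a shuffle would run the whole computation
    // in a single task; shuffle = true keeps the upstream stage parallel.
    val single = data.coalesce(1, shuffle = true)

    // With shuffle = true the partition count can also grow; the data is
    // redistributed with a hash partitioner.
    val widened = data.coalesce(1000, shuffle = true)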
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index adc971050e..c1df5e151e 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -140,7 +140,7 @@ class RDDSuite extends FunSuite with SharedSparkContext {
assert(rdd.union(emptyKv).collect().size === 2)
}
- test("cogrouped RDDs") {
+ test("coalesced RDDs") {
val data = sc.parallelize(1 to 10, 10)
val coalesced1 = data.coalesce(2)
@@ -175,8 +175,14 @@ class RDDSuite extends FunSuite with SharedSparkContext {
val coalesced5 = data.coalesce(1, shuffle = true)
assert(coalesced5.dependencies.head.rdd.dependencies.head.rdd.asInstanceOf[ShuffledRDD[_, _, _]] !=
null)
+
+ // when shuffling, we can increase the number of partitions
+ val coalesced6 = data.coalesce(20, shuffle = true)
+ assert(coalesced6.partitions.size === 20)
+ assert(coalesced6.collect().toSet === (1 to 10).toSet)
}
- test("cogrouped RDDs with locality") {
+
+ test("coalesced RDDs with locality") {
val data3 = sc.makeRDD(List((1,List("a","c")), (2,List("a","b","c")), (3,List("b"))))
val coal3 = data3.coalesce(3)
val list3 = coal3.partitions.map(p => p.asInstanceOf[CoalescedRDDPartition].preferredLocation)
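
The assertion on coalesced5 above works because shuffle = true inserts a ShuffledRDD beneath the coalesced result. A sketch of the same dependency-chain check outside the suite, mirroring the test's traversal and assuming the same sc fixture:

    import org.apache.spark.rdd.ShuffledRDD

    val coalesced = sc.parallelize(1 to 10, 10).coalesce(1, shuffle = true)
    // Walk two levels up the dependency chain; with shuffle = true the
    // grandparent should be the ShuffledRDD that coalesce inserted.
    val grandparent = coalesced.dependencies.head.rdd.dependencies.head.rdd
    assert(grandparent.isInstanceOf[ShuffledRDD[_, _, _]])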
@@ -197,11 +203,11 @@ class RDDSuite extends FunSuite with SharedSparkContext {
val coalesced4 = data.coalesce(20)
val listOfLists = coalesced4.glom().collect().map(_.toList).toList
val sortedList = listOfLists.sortWith{ (x, y) => !x.isEmpty && (y.isEmpty || (x(0) < y(0))) }
- assert( sortedList === (1 to 9).
+ assert(sortedList === (1 to 9).
map{x => List(x)}.toList, "Tried coalescing 9 partitions to 20 but didn't get 9 back")
}
- test("cogrouped RDDs with locality, large scale (10K partitions)") {
+ test("coalesced RDDs with locality, large scale (10K partitions)") {
// large scale experiment
import collection.mutable
val rnd = scala.util.Random
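
For reference, the glom-based check above can be reproduced in isolation. A minimal sketch (again assuming a live sc): without shuffle = true, coalesce never increases the partition count, so requesting 20 partitions from 9 still yields 9.

    val nine = sc.parallelize(1 to 9, 9)
    val stillNine = nine.coalesce(20)  // no shuffle: capped at 9 partitions
    assert(stillNine.partitions.size == 9)
    // glom() turns each partition into an array, exposing the distribution.
    val contents = stillNine.glom().collect().map(_.toList)
    assert(contents.count(_.nonEmpty) == 9)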