aboutsummaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorAriel Rabkin <asrabkin@cs.princeton.edu>2014-06-12 17:51:33 -0700
committerReynold Xin <rxin@apache.org>2014-06-12 17:51:33 -0700
commit0154587ab71d1b864f97497dbb38bc52b87675be (patch)
tree0c1254088b9c7f9c1aaec8c191986bdb4f080cbf /core
parenta6e0afdcf0174425e8a6ff20b2bc2e3a7a374f19 (diff)
downloadspark-0154587ab71d1b864f97497dbb38bc52b87675be.tar.gz
spark-0154587ab71d1b864f97497dbb38bc52b87675be.tar.bz2
spark-0154587ab71d1b864f97497dbb38bc52b87675be.zip
document laziness of parallelize
Took me several hours to figure out this behavior. It would be good to highlight it in the documentation. Author: Ariel Rabkin <asrabkin@cs.princeton.edu> Closes #1070 from asrabkin/master and squashes the following commits: 29a076e [Ariel Rabkin] doc fix
Diffstat (limited to 'core')
-rw-r--r--core/src/main/scala/org/apache/spark/SparkContext.scala13
1 file changed, 11 insertions, 2 deletions
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 8fbda2c667..35970c2f50 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -434,12 +434,21 @@ class SparkContext(config: SparkConf) extends Logging {
// Methods for creating RDDs
- /** Distribute a local Scala collection to form an RDD. */
+ /** Distribute a local Scala collection to form an RDD.
+ *
+ * @note Parallelize acts lazily. If `seq` is a mutable collection and is
+ * altered after the call to parallelize and before the first action on the
+ * RDD, the resultant RDD will reflect the modified collection. Pass a copy of
+ * the argument to avoid this.
+ */
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
}
- /** Distribute a local Scala collection to form an RDD. */
+ /** Distribute a local Scala collection to form an RDD.
+ *
+ * This method is identical to `parallelize`.
+ */
def makeRDD[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
parallelize(seq, numSlices)
}