[SPARK-6370][core] Documentation: Improve all 3 docs for RDD.sample

The docs for the `sample` method were insufficient; this change documents the `withReplacement`, `fraction`, and `seed` parameters in the Scala, Java, and Python APIs. Author: mbonaci <mbonaci@gmail.com> Closes #5097 from mbonaci/master and squashes the following commits: a6a9d97 [mbonaci] [SPARK-6370][core] Documentation: Improve all 3 docs for RDD.sample method
author: mbonaci <mbonaci@gmail.com> 2015-03-20 18:30:45 +0000
committer: Sean Owen <sowen@cloudera.com> 2015-03-20 18:33:53 +0000
commit: 28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060 (patch)
tree: 24985af7a3e26e1c852e3e9615d4eb188ff78f08
parent: db4d317ccfdd9bd1dc7e8beac54ebcc35966b7d5 (diff)
download: spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.tar.gz
spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.tar.bz2
spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.zip
3 files changed, 23 insertions, 0 deletions
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
index 645dc3bfb6..3e9beb670f 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
@@ -101,12 +101,23 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
 
   /**
    * Return a sampled subset of this RDD.
+   * 
+   * @param withReplacement can elements be sampled multiple times (replaced when sampled out)
+   * @param fraction expected size of the sample as a fraction of this RDD's size
+   *  without replacement: probability that each element is chosen; fraction must be in [0, 1]
+   *  with replacement: expected number of times each element is chosen; fraction must be >= 0
    */
   def sample(withReplacement: Boolean, fraction: Double): JavaRDD[T] =
     sample(withReplacement, fraction, Utils.random.nextLong)
     
   /**
    * Return a sampled subset of this RDD.
+   * 
+   * @param withReplacement can elements be sampled multiple times (replaced when sampled out)
+   * @param fraction expected size of the sample as a fraction of this RDD's size
+   *  without replacement: probability that each element is chosen; fraction must be in [0, 1]
+   *  with replacement: expected number of times each element is chosen; fraction must be >= 0
+   * @param seed seed for the random number generator
    */
   def sample(withReplacement: Boolean, fraction: Double, seed: Long): JavaRDD[T] =
     wrapRDD(rdd.sample(withReplacement, fraction, seed))
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index a139780d96..a4c74ed03e 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -377,6 +377,12 @@ abstract class RDD[T: ClassTag](
 
   /**
    * Return a sampled subset of this RDD.
+   * 
+   * @param withReplacement can elements be sampled multiple times (replaced when sampled out)
+   * @param fraction expected size of the sample as a fraction of this RDD's size
+   *  without replacement: probability that each element is chosen; fraction must be in [0, 1]
+   *  with replacement: expected number of times each element is chosen; fraction must be >= 0
+   * @param seed seed for the random number generator
    */
   def sample(withReplacement: Boolean,
       fraction: Double,
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index bf17f513c0..c337a43c8a 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -346,6 +346,12 @@ class RDD(object):
         """
         Return a sampled subset of this RDD.
 
+        :param withReplacement: can elements be sampled multiple times (replaced when sampled out)
+        :param fraction: expected size of the sample as a fraction of this RDD's size
+            without replacement: probability that each element is chosen; fraction must be in [0, 1]
+            with replacement: expected number of times each element is chosen; fraction must be >= 0
+        :param seed: seed for the random number generator
+
         >>> rdd = sc.parallelize(range(100), 4)
         >>> rdd.sample(False, 0.1, 81).count()
         10
author	mbonaci <mbonaci@gmail.com>	2015-03-20 18:30:45 +0000
committer	Sean Owen <sowen@cloudera.com>	2015-03-20 18:33:53 +0000
commit	28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060 (patch)
tree	24985af7a3e26e1c852e3e9615d4eb188ff78f08
parent	db4d317ccfdd9bd1dc7e8beac54ebcc35966b7d5 (diff)
download	spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.tar.gz spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.tar.bz2 spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.zip