aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author mbonaci <mbonaci@gmail.com> 2015-03-20 18:30:45 +0000
committer Sean Owen <sowen@cloudera.com> 2015-03-20 18:33:53 +0000
commit 28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060 (patch)
tree 24985af7a3e26e1c852e3e9615d4eb188ff78f08
parent db4d317ccfdd9bd1dc7e8beac54ebcc35966b7d5 (diff)
download spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.tar.gz
spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.tar.bz2
spark-28bcb9e9e86a4b643fbf96b2b7e03928ebcfc060.zip
[SPARK-6370][core] Documentation: Improve all 3 docs for RDD.sample
The docs for the `sample` method were insufficient, now less so.

Author: mbonaci <mbonaci@gmail.com>

Closes #5097 from mbonaci/master and squashes the following commits:

a6a9d97 [mbonaci] [SPARK-6370][core] Documentation: Improve all 3 docs for RDD.sample method
-rw-r--r-- core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala 11
-rw-r--r-- core/src/main/scala/org/apache/spark/rdd/RDD.scala 6
-rw-r--r-- python/pyspark/rdd.py 6
3 files changed, 23 insertions, 0 deletions
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
index 645dc3bfb6..3e9beb670f 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
@@ -101,12 +101,23 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
/**
* Return a sampled subset of this RDD.
+ *
+ * @param withReplacement can elements be sampled multiple times (replaced when sampled out)
+ * @param fraction expected size of the sample as a fraction of this RDD's size
+ * without replacement: probability that each element is chosen; fraction must be [0, 1]
+ * with replacement: expected number of times each element is chosen; fraction must be >= 0
*/
def sample(withReplacement: Boolean, fraction: Double): JavaRDD[T] =
sample(withReplacement, fraction, Utils.random.nextLong)
/**
* Return a sampled subset of this RDD.
+ *
+ * @param withReplacement can elements be sampled multiple times (replaced when sampled out)
+ * @param fraction expected size of the sample as a fraction of this RDD's size
+ * without replacement: probability that each element is chosen; fraction must be [0, 1]
+ * with replacement: expected number of times each element is chosen; fraction must be >= 0
+ * @param seed seed for the random number generator
*/
def sample(withReplacement: Boolean, fraction: Double, seed: Long): JavaRDD[T] =
wrapRDD(rdd.sample(withReplacement, fraction, seed))
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index a139780d96..a4c74ed03e 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -377,6 +377,12 @@ abstract class RDD[T: ClassTag](
/**
* Return a sampled subset of this RDD.
+ *
+ * @param withReplacement can elements be sampled multiple times (replaced when sampled out)
+ * @param fraction expected size of the sample as a fraction of this RDD's size
+ * without replacement: probability that each element is chosen; fraction must be [0, 1]
+ * with replacement: expected number of times each element is chosen; fraction must be >= 0
+ * @param seed seed for the random number generator
*/
def sample(withReplacement: Boolean,
fraction: Double,
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index bf17f513c0..c337a43c8a 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -346,6 +346,12 @@ class RDD(object):
"""
Return a sampled subset of this RDD.
+ :param withReplacement: can elements be sampled multiple times (replaced when sampled out)
+ :param fraction: expected size of the sample as a fraction of this RDD's size
+ without replacement: probability that each element is chosen; fraction must be [0, 1]
+ with replacement: expected number of times each element is chosen; fraction must be >= 0
+ :param seed: seed for the random number generator
+
>>> rdd = sc.parallelize(range(100), 4)
>>> rdd.sample(False, 0.1, 81).count()
10