aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoranabranch <wac.chambers@gmail.com>2016-11-17 11:34:55 +0000
committerSean Owen <sowen@cloudera.com>2016-11-17 11:34:55 +0000
commit49b6f456aca350e9e2c170782aa5cc75e7822680 (patch)
tree3a13f932b73feeab6b01f1d039728758203edcf0
parenta3cac7bd86a6fe8e9b42da1bf580aaeb59378304 (diff)
downloadspark-49b6f456aca350e9e2c170782aa5cc75e7822680.tar.gz
spark-49b6f456aca350e9e2c170782aa5cc75e7822680.tar.bz2
spark-49b6f456aca350e9e2c170782aa5cc75e7822680.zip
[SPARK-18365][DOCS] Improve Sample Method Documentation
## What changes were proposed in this pull request? I found the documentation for the sample method to be confusing; this adds more clarification across all languages. - [x] Scala - [x] Python - [x] R - [x] RDD Scala - [ ] RDD Python with SEED - [X] RDD Java - [x] RDD Java with SEED - [x] RDD Python ## How was this patch tested? NA Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: anabranch <wac.chambers@gmail.com> Author: Bill Chambers <bill@databricks.com> Closes #15815 from anabranch/SPARK-18365.
-rw-r--r--R/pkg/R/DataFrame.R4
-rw-r--r--core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala8
-rw-r--r--core/src/main/scala/org/apache/spark/rdd/RDD.scala3
-rw-r--r--python/pyspark/rdd.py5
-rw-r--r--python/pyspark/sql/dataframe.py5
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala10
6 files changed, 30 insertions, 5 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 1cf9b38ea6..4e3d97bb3a 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -936,7 +936,9 @@ setMethod("unique",
#' Sample
#'
-#' Return a sampled subset of this SparkDataFrame using a random seed.
+#' Return a sampled subset of this SparkDataFrame using a random seed.
+#' Note: this is not guaranteed to provide exactly the fraction specified
+#' of the total count of the given SparkDataFrame.
#'
#' @param x A SparkDataFrame
#' @param withReplacement Sampling with replacement or not
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
index 20d6c9341b..d67cff64e6 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala
@@ -98,7 +98,9 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
def repartition(numPartitions: Int): JavaRDD[T] = rdd.repartition(numPartitions)
/**
- * Return a sampled subset of this RDD.
+ * Return a sampled subset of this RDD with a random seed.
+ * Note: this is NOT guaranteed to provide exactly the fraction of the count
+ * of the given [[RDD]].
*
* @param withReplacement can elements be sampled multiple times (replaced when sampled out)
* @param fraction expected size of the sample as a fraction of this RDD's size
@@ -109,7 +111,9 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T])
sample(withReplacement, fraction, Utils.random.nextLong)
/**
- * Return a sampled subset of this RDD.
+ * Return a sampled subset of this RDD, with a user-supplied seed.
+ * Note: this is NOT guaranteed to provide exactly the fraction of the count
+ * of the given [[RDD]].
*
* @param withReplacement can elements be sampled multiple times (replaced when sampled out)
* @param fraction expected size of the sample as a fraction of this RDD's size
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index e018af35cb..cded899db1 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -466,6 +466,9 @@ abstract class RDD[T: ClassTag](
/**
* Return a sampled subset of this RDD.
*
+ * Note: this is NOT guaranteed to provide exactly the fraction of the count
+ * of the given [[RDD]].
+ *
* @param withReplacement can elements be sampled multiple times (replaced when sampled out)
* @param fraction expected size of the sample as a fraction of this RDD's size
* without replacement: probability that each element is chosen; fraction must be [0, 1]
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 2de2c2fd1a..a163ceafe9 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -386,6 +386,11 @@ class RDD(object):
with replacement: expected number of times each element is chosen; fraction must be >= 0
:param seed: seed for the random number generator
+ .. note::
+
+ This is not guaranteed to provide exactly the fraction specified of the total count
+ of the given :class:`RDD`.
+
>>> rdd = sc.parallelize(range(100), 4)
>>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14
True
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 29710acf54..3899890083 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -549,6 +549,11 @@ class DataFrame(object):
def sample(self, withReplacement, fraction, seed=None):
"""Returns a sampled subset of this :class:`DataFrame`.
+ .. note::
+
+ This is not guaranteed to provide exactly the fraction specified of the total count
+ of the given :class:`DataFrame`.
+
>>> df.sample(False, 0.5, 42).count()
2
"""
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index af30683cc0..3761773698 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -1646,7 +1646,10 @@ class Dataset[T] private[sql](
}
/**
- * Returns a new Dataset by sampling a fraction of rows.
+ * Returns a new [[Dataset]] by sampling a fraction of rows, using a user-supplied seed.
+ *
+ * Note: this is NOT guaranteed to provide exactly the fraction of the count
+ * of the given [[Dataset]].
*
* @param withReplacement Sample with replacement or not.
* @param fraction Fraction of rows to generate.
@@ -1665,7 +1668,10 @@ class Dataset[T] private[sql](
}
/**
- * Returns a new Dataset by sampling a fraction of rows, using a random seed.
+ * Returns a new [[Dataset]] by sampling a fraction of rows, using a random seed.
+ *
+ * Note: this is NOT guaranteed to provide exactly the fraction of the total count
+ * of the given [[Dataset]].
*
* @param withReplacement Sample with replacement or not.
* @param fraction Fraction of rows to generate.