From 49b6f456aca350e9e2c170782aa5cc75e7822680 Mon Sep 17 00:00:00 2001 From: anabranch Date: Thu, 17 Nov 2016 11:34:55 +0000 Subject: [SPARK-18365][DOCS] Improve Sample Method Documentation ## What changes were proposed in this pull request? I found the documentation for the sample method to be confusing, this adds more clarification across all languages. - [x] Scala - [x] Python - [x] R - [x] RDD Scala - [ ] RDD Python with SEED - [X] RDD Java - [x] RDD Java with SEED - [x] RDD Python ## How was this patch tested? NA Please review https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark before opening a pull request. Author: anabranch Author: Bill Chambers Closes #15815 from anabranch/SPARK-18365. --- R/pkg/R/DataFrame.R | 4 +++- core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala | 8 ++++++-- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 3 +++ python/pyspark/rdd.py | 5 +++++ python/pyspark/sql/dataframe.py | 5 +++++ sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 10 ++++++++-- 6 files changed, 30 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 1cf9b38ea6..4e3d97bb3a 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -936,7 +936,9 @@ setMethod("unique", #' Sample #' -#' Return a sampled subset of this SparkDataFrame using a random seed. +#' Return a sampled subset of this SparkDataFrame using a random seed. +#' Note: this is not guaranteed to provide exactly the fraction specified +#' of the total count of the given SparkDataFrame. 
#' #' @param x A SparkDataFrame #' @param withReplacement Sampling with replacement or not diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala index 20d6c9341b..d67cff64e6 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDD.scala @@ -98,7 +98,9 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) def repartition(numPartitions: Int): JavaRDD[T] = rdd.repartition(numPartitions) /** - * Return a sampled subset of this RDD. + * Return a sampled subset of this RDD with a random seed. + * Note: this is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size @@ -109,7 +111,9 @@ class JavaRDD[T](val rdd: RDD[T])(implicit val classTag: ClassTag[T]) sample(withReplacement, fraction, Utils.random.nextLong) /** - * Return a sampled subset of this RDD. + * Return a sampled subset of this RDD, with a user-supplied seed. + * Note: this is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index e018af35cb..cded899db1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -466,6 +466,9 @@ abstract class RDD[T: ClassTag]( /** * Return a sampled subset of this RDD. * + * Note: this is NOT guaranteed to provide exactly the fraction of the count + * of the given [[RDD]]. 
+ * * @param withReplacement can elements be sampled multiple times (replaced when sampled out) * @param fraction expected size of the sample as a fraction of this RDD's size * without replacement: probability that each element is chosen; fraction must be [0, 1] diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 2de2c2fd1a..a163ceafe9 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -386,6 +386,11 @@ class RDD(object): with replacement: expected number of times each element is chosen; fraction must be >= 0 :param seed: seed for the random number generator + .. note:: + + This is not guaranteed to provide exactly the fraction specified of the total count + of the given :class:`RDD`. + >>> rdd = sc.parallelize(range(100), 4) >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14 True diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 29710acf54..3899890083 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -549,6 +549,11 @@ class DataFrame(object): def sample(self, withReplacement, fraction, seed=None): """Returns a sampled subset of this :class:`DataFrame`. + .. note:: + + This is not guaranteed to provide exactly the fraction specified of the total count + of the given :class:`DataFrame`. + >>> df.sample(False, 0.5, 42).count() 2 """ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index af30683cc0..3761773698 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1646,7 +1646,10 @@ class Dataset[T] private[sql]( } /** - * Returns a new Dataset by sampling a fraction of rows. + * Returns a new [[Dataset]] by sampling a fraction of rows, using a user-supplied seed. + * + * Note: this is NOT guaranteed to provide exactly the fraction of the count + * of the given [[Dataset]]. 
* * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. @@ -1665,7 +1668,10 @@ class Dataset[T] private[sql]( } /** - * Returns a new Dataset by sampling a fraction of rows, using a random seed. + * Returns a new [[Dataset]] by sampling a fraction of rows, using a random seed. + * + * Note: this is NOT guaranteed to provide exactly the fraction of the total count + * of the given [[Dataset]]. * * @param withReplacement Sample with replacement or not. * @param fraction Fraction of rows to generate. -- cgit v1.2.3