aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/rdd.py 5
-rw-r--r-- python/pyspark/sql/dataframe.py 5
2 files changed, 10 insertions, 0 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 2de2c2fd1a..a163ceafe9 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -386,6 +386,11 @@ class RDD(object):
with replacement: expected number of times each element is chosen; fraction must be >= 0
:param seed: seed for the random number generator
+ .. note::
+
+ This is not guaranteed to return exactly the specified fraction of the total count
+ of the given :class:`RDD`.
+
>>> rdd = sc.parallelize(range(100), 4)
>>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14
True
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 29710acf54..3899890083 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -549,6 +549,11 @@ class DataFrame(object):
def sample(self, withReplacement, fraction, seed=None):
"""Returns a sampled subset of this :class:`DataFrame`.
+ .. note::
+
+ This is not guaranteed to return exactly the specified fraction of the total count
+ of the given :class:`DataFrame`.
+
>>> df.sample(False, 0.5, 42).count()
2
"""