about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- python/pyspark/rddsampler.py 4
-rw-r--r-- python/pyspark/tests.py 6
2 files changed, 8 insertions, 2 deletions
diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py
index 55e247da0e..528a181e89 100644
--- a/python/pyspark/rddsampler.py
+++ b/python/pyspark/rddsampler.py
@@ -31,7 +31,7 @@ class RDDSamplerBase(object):
"Falling back to default random generator for sampling.")
self._use_numpy = False
- self._seed = seed if seed is not None else random.randint(0, sys.maxint)
+ self._seed = seed if seed is not None else random.randint(0, 2 ** 32 - 1)
self._withReplacement = withReplacement
self._random = None
self._split = None
@@ -47,7 +47,7 @@ class RDDSamplerBase(object):
for _ in range(0, split):
# discard the next few values in the sequence to have a
# different seed for the different splits
- self._random.randint(0, sys.maxint)
+ self._random.randint(0, 2 ** 32 - 1)
self._split = split
self._rand_initialized = True
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index f5ccf31abb..1a8e4150e6 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -433,6 +433,12 @@ class RDDTests(ReusedPySparkTestCase):
os.unlink(tempFile.name)
self.assertRaises(Exception, lambda: filtered_data.count())
+ def test_sampling_default_seed(self):
+ # Test for SPARK-3995 (default seed setting)
+ data = self.sc.parallelize(range(1000), 1)
+ subset = data.takeSample(False, 10)
+ self.assertEqual(len(subset), 10)
+
def testAggregateByKey(self):
data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)