diff options
Diffstat (limited to 'python/pyspark/rddsampler.py')
-rw-r--r-- | python/pyspark/rddsampler.py | 14 |
1 files changed, 14 insertions, 0 deletions
diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py
index f5c3cfd259..558dcfd12d 100644
--- a/python/pyspark/rddsampler.py
+++ b/python/pyspark/rddsampler.py
@@ -115,6 +115,20 @@ class RDDSampler(RDDSamplerBase):
                     yield obj
 
 
+class RDDRangeSampler(RDDSamplerBase):
+
+    def __init__(self, lowerBound, upperBound, seed=None):
+        RDDSamplerBase.__init__(self, False, seed)
+        self._use_numpy = False  # no performance gain from numpy
+        self._lowerBound = lowerBound
+        self._upperBound = upperBound
+
+    def func(self, split, iterator):
+        for obj in iterator:
+            if self._lowerBound <= self.getUniformSample(split) < self._upperBound:
+                yield obj
+
+
 class RDDStratifiedSampler(RDDSamplerBase):
 
     def __init__(self, withReplacement, fractions, seed=None):