diff options
Diffstat (limited to 'core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala')
-rw-r--r-- | core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala index 791491daf0..7eb2f56c20 100644 --- a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala @@ -129,6 +129,13 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { t(m / 2) } + def replacementSampling(data: Iterator[Int], sampler: PoissonSampler[Int]): Iterator[Int] = { + data.flatMap { item => + val count = sampler.sample() + if (count == 0) Iterator.empty else Iterator.fill(count)(item) + } + } + test("utilities") { val s1 = Array(0, 1, 1, 0, 2) val s2 = Array(1, 0, 3, 2, 1) @@ -189,6 +196,36 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { d should be > D } + test("bernoulli sampling without iterator") { + // Tests expect maximum gap sampling fraction to be this value + RandomSampler.defaultMaxGapSamplingFraction should be (0.4) + + var d: Double = 0.0 + + val data = Iterator.from(0) + + var sampler: RandomSampler[Int, Int] = new BernoulliSampler[Int](0.5) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.5))) + d should be < D + + sampler = new BernoulliSampler[Int](0.7) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.7))) + d should be < D + + sampler = new BernoulliSampler[Int](0.9) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.9))) + d should be < D + + // sampling at different frequencies should show up as statistically different: + sampler = new BernoulliSampler[Int](0.5) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.6))) + d should be > D + } + test("bernoulli sampling with gap sampling optimization") { // Tests expect maximum gap sampling fraction to be this value RandomSampler.defaultMaxGapSamplingFraction should be (0.4) @@ -217,6 +254,37 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { d should be > D } + test("bernoulli sampling (without iterator) with gap sampling optimization") { + // Tests expect maximum gap sampling fraction to be this value + RandomSampler.defaultMaxGapSamplingFraction should be (0.4) + + var d: Double = 0.0 + + val data = Iterator.from(0) + + var sampler: RandomSampler[Int, Int] = new BernoulliSampler[Int](0.01) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), + gaps(sample(Iterator.from(0), 0.01))) + d should be < D + + sampler = new BernoulliSampler[Int](0.1) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.1))) + d should be < D + + sampler = new BernoulliSampler[Int](0.3) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.3))) + d should be < D + + // sampling at different frequencies should show up as statistically different: + sampler = new BernoulliSampler[Int](0.3) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.4))) + d should be > D + } + test("bernoulli boundary cases") { val data = (1 to 100).toArray @@ -233,6 +301,22 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { sampler.sample(data.iterator).toArray should be (data) } + test("bernoulli (without iterator) boundary cases") { + val data = (1 to 100).toArray + + var sampler = new BernoulliSampler[Int](0.0) + data.filter(_ => sampler.sample() > 0) should be (Array.empty[Int]) + + sampler = new BernoulliSampler[Int](1.0) + data.filter(_ => sampler.sample() > 0) should be (data) + + sampler = new BernoulliSampler[Int](0.0 - (RandomSampler.roundingEpsilon / 2.0)) + data.filter(_ => sampler.sample() > 0) should be (Array.empty[Int]) + + sampler = new BernoulliSampler[Int](1.0 + (RandomSampler.roundingEpsilon / 2.0)) + data.filter(_ => sampler.sample() > 0) should be (data) + } + test("bernoulli data types") { // Tests expect maximum gap sampling fraction to be this value RandomSampler.defaultMaxGapSamplingFraction should be (0.4) @@ -341,6 +425,36 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { d should be > D } + test("replacement sampling without iterator") { + // Tests expect maximum gap sampling fraction to be this value + RandomSampler.defaultMaxGapSamplingFraction should be (0.4) + + var d: Double = 0.0 + + val data = Iterator.from(0) + + var sampler = new PoissonSampler[Int](0.5) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(replacementSampling(data, sampler)), gaps(sampleWR(Iterator.from(0), 0.5))) + d should be < D + + sampler = new PoissonSampler[Int](0.7) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(replacementSampling(data, sampler)), gaps(sampleWR(Iterator.from(0), 0.7))) + d should be < D + + sampler = new PoissonSampler[Int](0.9) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(replacementSampling(data, sampler)), gaps(sampleWR(Iterator.from(0), 0.9))) + d should be < D + + // sampling at different frequencies should show up as statistically different: + sampler = new PoissonSampler[Int](0.5) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(replacementSampling(data, sampler)), gaps(sampleWR(Iterator.from(0), 0.6))) + d should be > D + } + test("replacement sampling with gap sampling") { // Tests expect maximum gap sampling fraction to be this value RandomSampler.defaultMaxGapSamplingFraction should be (0.4) @@ -369,6 +483,36 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { d should be > D } + test("replacement sampling (without iterator) with gap sampling") { + // Tests expect maximum gap sampling fraction to be this value + RandomSampler.defaultMaxGapSamplingFraction should be (0.4) + + var d: Double = 0.0 + + val data = Iterator.from(0) + + var sampler = new PoissonSampler[Int](0.01) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(replacementSampling(data, sampler)), gaps(sampleWR(Iterator.from(0), 0.01))) + d should be < D + + sampler = new PoissonSampler[Int](0.1) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(replacementSampling(data, sampler)), gaps(sampleWR(Iterator.from(0), 0.1))) + d should be < D + + sampler = new PoissonSampler[Int](0.3) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(replacementSampling(data, sampler)), gaps(sampleWR(Iterator.from(0), 0.3))) + d should be < D + + // sampling at different frequencies should show up as statistically different: + sampler = new PoissonSampler[Int](0.3) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(replacementSampling(data, sampler)), gaps(sampleWR(Iterator.from(0), 0.4))) + d should be > D + } + test("replacement boundary cases") { val data = (1 to 100).toArray @@ -383,6 +527,20 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { sampler.sample(data.iterator).length should be > (data.length) } + test("replacement (without) boundary cases") { + val data = (1 to 100).toArray + + var sampler = new PoissonSampler[Int](0.0) + replacementSampling(data.iterator, sampler).toArray should be (Array.empty[Int]) + + sampler = new PoissonSampler[Int](0.0 - (RandomSampler.roundingEpsilon / 2.0)) + replacementSampling(data.iterator, sampler).toArray should be (Array.empty[Int]) + + // sampling with replacement has no upper bound on sampling fraction + sampler = new PoissonSampler[Int](2.0) + replacementSampling(data.iterator, sampler).length should be > (data.length) + } + test("replacement data types") { // Tests expect maximum gap sampling fraction to be this value RandomSampler.defaultMaxGapSamplingFraction should be (0.4) @@ -477,6 +635,22 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { d should be < D } + test("bernoulli partitioning sampling without iterator") { + var d: Double = 0.0 + + val data = Iterator.from(0) + + var sampler = new BernoulliCellSampler[Int](0.1, 0.2) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.1))) + d should be < D + + sampler = new BernoulliCellSampler[Int](0.1, 0.2, true) + sampler.setSeed(rngSeed.nextLong) + d = medianKSD(gaps(data.filter(_ => sampler.sample() > 0)), gaps(sample(Iterator.from(0), 0.9))) + d should be < D + } + test("bernoulli partitioning boundary cases") { val data = (1 to 100).toArray val d = RandomSampler.roundingEpsilon / 2.0 @@ -500,6 +674,29 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { sampler.sample(data.iterator).toArray should be (Array.empty[Int]) } + test("bernoulli partitioning (without iterator) boundary cases") { + val data = (1 to 100).toArray + val d = RandomSampler.roundingEpsilon / 2.0 + + var sampler = new BernoulliCellSampler[Int](0.0, 0.0) + data.filter(_ => sampler.sample() > 0).toArray should be (Array.empty[Int]) + + sampler = new BernoulliCellSampler[Int](0.5, 0.5) + data.filter(_ => sampler.sample() > 0).toArray should be (Array.empty[Int]) + + sampler = new BernoulliCellSampler[Int](1.0, 1.0) + data.filter(_ => sampler.sample() > 0).toArray should be (Array.empty[Int]) + + sampler = new BernoulliCellSampler[Int](0.0, 1.0) + data.filter(_ => sampler.sample() > 0).toArray should be (data) + + sampler = new BernoulliCellSampler[Int](0.0 - d, 1.0 + d) + data.filter(_ => sampler.sample() > 0).toArray should be (data) + + sampler = new BernoulliCellSampler[Int](0.5, 0.5 - d) + data.filter(_ => sampler.sample() > 0).toArray should be (Array.empty[Int]) + } + test("bernoulli partitioning data") { val seed = rngSeed.nextLong val data = (1 to 100).toArray |