path: root/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.util.random

import java.util.Random

import scala.collection.mutable.ArrayBuffer

import org.apache.commons.math3.distribution.PoissonDistribution
import org.scalatest.Matchers

import org.apache.spark.SparkFunSuite

class RandomSamplerSuite extends SparkFunSuite with Matchers {
  /**
   * My statistical testing methodology is to run a Kolmogorov-Smirnov (KS) test
   * between the random samplers and simple reference samplers (known to work correctly).
   * If the samplers are working properly, the gap sizes between chosen samples should have
   * the same distribution for the samplers under test as for the references.  That is, the
   * KS test will fail to strongly reject the null hypothesis that the distributions of
   * sampling gaps are the same.
   * There is no KS test implementation readily available for Scala (that I can find), so
   * what I have done here is pre-compute "D" - the critical value of the KS statistic -
   * that corresponds to a "weak" p-value for a particular sample size.  I can then test
   * that my measured KS stats are less than D.  Computing D values is easy, and is shown
   * below.
   *
   * I used the scipy 'kstwobign' distribution to pre-compute my D value:
   *
   * # assumes: import numpy as np; from scipy import stats
   * def ksdval(q=0.1, n=1000):
   *     en = np.sqrt(float(n) / 2.0)
   *     return stats.kstwobign.isf(float(q)) / (en + 0.12 + 0.11 / en)
   *
   * When comparing KS stats I take the median of a small number of independent test runs
   * to compensate for the fact that any sampled statistic will show a "false positive" with
   * some probability.  Even when two distributions are the same, they will register as
   * different 10% of the time at a p-value of 0.1.
   */

  // This D value is the precomputed KS critical value for p-value 0.1, sample size 1000:
  val sampleSize = 1000
  val D = 0.0544280747619
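
  // A minimal, illustrative sketch (not used by the tests below) showing how the D value
  // above can be reproduced, assuming the standard asymptotic approximation for the
  // two-sided KS critical value; 1.22385 is approximately scipy's stats.kstwobign.isf(0.1).
  def approxKSCriticalValue(n: Int, kstwobignIsfQ: Double = 1.22385): Double = {
    val en = math.sqrt(n / 2.0)
    kstwobignIsfQ / (en + 0.12 + 0.11 / en)
  }
  // e.g. approxKSCriticalValue(sampleSize) =~ 0.05443, matching the precomputed D above.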

  // I'm not a big fan of fixing seeds, but unit tests built on statistical tests will always
  // fail with some nonzero probability, so I'll fix the seed to prevent these tests from
  // generating random failure noise in CI testing, etc.
  val rngSeed: Random = RandomSampler.newDefaultRNG
  rngSeed.setSeed(235711)

  // Reference implementation of sampling without replacement (Bernoulli)
  def sample[T](data: Iterator[T], f: Double): Iterator[T] = {
    val rng: Random = RandomSampler.newDefaultRNG
    rng.setSeed(rngSeed.nextLong)
    data.filter(_ => (rng.nextDouble <= f))
  }

  // Reference implementation of sampling with replacement
  def sampleWR[T](data: Iterator[T], f: Double): Iterator[T] = {
    val rng = new PoissonDistribution(f)
    rng.reseedRandomGenerator(rngSeed.nextLong)
    data.flatMap { v => {
      val rep = rng.sample()
      if (rep == 0) Iterator.empty else Iterator.fill(rep)(v)
    }}
  }

  // Returns an iterator over the gap lengths between samples.
  // This function assumes the input data are integers sampled from the sequence of
  // increasing integers {0, 1, 2, ...}.  This works because that is how I generate them,
  // and the samplers preserve their input order.
  def gaps(data: Iterator[Int]): Iterator[Int] = {
    data.sliding(2).withPartial(false).map { x => x(1) - x(0) }
  }

  // Returns the cumulative distribution from a histogram
  def cumulativeDist(hist: Array[Int]): Array[Double] = {
    val n = hist.sum.toDouble
    assert(n > 0.0)
    hist.scanLeft(0)(_ + _).drop(1).map { _.toDouble / n }
  }

  // Returns aligned cumulative distributions from two arrays of data
  def cumulants(d1: Array[Int], d2: Array[Int],
      ss: Int = sampleSize): (Array[Double], Array[Double]) = {
    assert(math.min(d1.length, d2.length) > 0)
    assert(math.min(d1.min, d2.min) >= 0)
    val m = 1 + math.max(d1.max, d2.max)
    val h1 = Array.fill[Int](m)(0)
    val h2 = Array.fill[Int](m)(0)
    for (v <- d1) { h1(v) += 1 }
    for (v <- d2) { h2(v) += 1 }
    assert(h1.sum == h2.sum)
    assert(h1.sum == ss)
    (cumulativeDist(h1), cumulativeDist(h2))
  }

  // Computes the Kolmogorov-Smirnov 'D' statistic from two cumulative distributions
  def KSD(cdf1: Array[Double], cdf2: Array[Double]): Double = {
    assert(cdf1.length == cdf2.length)
    val n = cdf1.length
    assert(n > 0)
    assert(cdf1(n-1) == 1.0)
    assert(cdf2(n-1) == 1.0)
    cdf1.zip(cdf2).map { x => Math.abs(x._1 - x._2) }.max
  }

  // Returns the median KS 'D' statistic between two samples, over m independent sampling trials
  def medianKSD(data1: => Iterator[Int], data2: => Iterator[Int], m: Int = 5): Double = {
    val t = Array.fill[Double](m) {
      val (c1, c2) = cumulants(data1.take(sampleSize).toArray,
                               data2.take(sampleSize).toArray)
      KSD(c1, c2)
    }.sorted
    // return the median KS statistic
    t(m / 2)
  }

  test("utilities") {
    val s1 = Array(0, 1, 1, 0, 2)
    val s2 = Array(1, 0, 3, 2, 1)
    val (c1, c2) = cumulants(s1, s2, ss = 5)
    c1 should be (Array(0.4, 0.8, 1.0, 1.0))
    c2 should be (Array(0.2, 0.6, 0.8, 1.0))
    KSD(c1, c2) should be (0.2 +- 0.000001)
    KSD(c2, c1) should be (KSD(c1, c2))
    gaps(List(0, 1, 1, 2, 4, 11).iterator).toArray should be (Array(1, 0, 1, 2, 7))
  }

  test("sanity check medianKSD against references") {
    var d: Double = 0.0

    // should be statistically the same, i.e. fail to strongly reject the null hypothesis
    d = medianKSD(gaps(sample(Iterator.from(0), 0.5)), gaps(sample(Iterator.from(0), 0.5)))
    d should be < D

    // should be statistically different - the KS statistic will take a high D value,
    // corresponding to a low p-value that rejects the null hypothesis
    d = medianKSD(gaps(sample(Iterator.from(0), 0.4)), gaps(sample(Iterator.from(0), 0.5)))
    d should be > D

    // same!
    d = medianKSD(gaps(sampleWR(Iterator.from(0), 0.5)), gaps(sampleWR(Iterator.from(0), 0.5)))
    d should be < D

    // different!
    d = medianKSD(gaps(sampleWR(Iterator.from(0), 0.5)), gaps(sampleWR(Iterator.from(0), 0.6)))
    d should be > D
  }

  test("bernoulli sampling") {
    // Tests expect maximum gap sampling fraction to be this value
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d: Double = 0.0

    var sampler: RandomSampler[Int, Int] = new BernoulliSampler[Int](0.5)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.5)))
    d should be < D

    sampler = new BernoulliSampler[Int](0.7)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.7)))
    d should be < D

    sampler = new BernoulliSampler[Int](0.9)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.9)))
    d should be < D

    // sampling at different frequencies should show up as statistically different:
    sampler = new BernoulliSampler[Int](0.5)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.6)))
    d should be > D
  }

  test("bernoulli sampling with gap sampling optimization") {
    // Tests expect maximum gap sampling fraction to be this value
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d: Double = 0.0

    var sampler: RandomSampler[Int, Int] = new BernoulliSampler[Int](0.01)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.01)))
    d should be < D

    sampler = new BernoulliSampler[Int](0.1)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.1)))
    d should be < D

    sampler = new BernoulliSampler[Int](0.3)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.3)))
    d should be < D

    // sampling at different frequencies should show up as statistically different:
    sampler = new BernoulliSampler[Int](0.3)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.4)))
    d should be > D
  }

  test("bernoulli boundary cases") {
    val data = (1 to 100).toArray

    var sampler = new BernoulliSampler[Int](0.0)
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])

    sampler = new BernoulliSampler[Int](1.0)
    sampler.sample(data.iterator).toArray should be (data)

    sampler = new BernoulliSampler[Int](0.0 - (RandomSampler.roundingEpsilon / 2.0))
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])

    sampler = new BernoulliSampler[Int](1.0 + (RandomSampler.roundingEpsilon / 2.0))
    sampler.sample(data.iterator).toArray should be (data)
  }

  test("bernoulli data types") {
    // Tests expect maximum gap sampling fraction to be this value
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d: Double = 0.0
    var sampler = new BernoulliSampler[Int](0.1)
    sampler.setSeed(rngSeed.nextLong)

    // Array iterator (indexable type)
    d = medianKSD(
      gaps(sampler.sample(Iterator.from(0).take(20*sampleSize).toArray.iterator)),
      gaps(sample(Iterator.from(0), 0.1)))
    d should be < D

    // ArrayBuffer iterator (indexable type)
    d = medianKSD(
      gaps(sampler.sample(Iterator.from(0).take(20*sampleSize).to[ArrayBuffer].iterator)),
      gaps(sample(Iterator.from(0), 0.1)))
    d should be < D

    // List iterator (non-indexable type)
    d = medianKSD(
      gaps(sampler.sample(Iterator.from(0).take(20*sampleSize).toList.iterator)),
      gaps(sample(Iterator.from(0), 0.1)))
    d should be < D
  }

  test("bernoulli clone") {
    // Tests expect maximum gap sampling fraction to be this value
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d = 0.0
    var sampler = new BernoulliSampler[Int](0.1).clone
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.1)))
    d should be < D

    sampler = new BernoulliSampler[Int](0.9).clone
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.9)))
    d should be < D
  }

  test("bernoulli set seed") {
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d: Double = 0.0
    var sampler1 = new BernoulliSampler[Int](0.2)
    var sampler2 = new BernoulliSampler[Int](0.2)

    // distributions should be identical if the seeds are set to the same value
    sampler1.setSeed(73)
    sampler2.setSeed(73)
    d = medianKSD(gaps(sampler1.sample(Iterator.from(0))), gaps(sampler2.sample(Iterator.from(0))))
    d should be (0.0)

    // should be different for different seeds
    sampler1.setSeed(73)
    sampler2.setSeed(37)
    d = medianKSD(gaps(sampler1.sample(Iterator.from(0))), gaps(sampler2.sample(Iterator.from(0))))
    d should be > 0.0
    d should be < D

    sampler1 = new BernoulliSampler[Int](0.8)
    sampler2 = new BernoulliSampler[Int](0.8)

    // distributions should be identical if the seeds are set to the same value
    sampler1.setSeed(73)
    sampler2.setSeed(73)
    d = medianKSD(gaps(sampler1.sample(Iterator.from(0))), gaps(sampler2.sample(Iterator.from(0))))
    d should be (0.0)

    // should be different for different seeds
    sampler1.setSeed(73)
    sampler2.setSeed(37)
    d = medianKSD(gaps(sampler1.sample(Iterator.from(0))), gaps(sampler2.sample(Iterator.from(0))))
    d should be > 0.0
    d should be < D
  }

  test("replacement sampling") {
    // Tests expect maximum gap sampling fraction to be this value
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d: Double = 0.0

    var sampler = new PoissonSampler[Int](0.5)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.5)))
    d should be < D

    sampler = new PoissonSampler[Int](0.7)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.7)))
    d should be < D

    sampler = new PoissonSampler[Int](0.9)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.9)))
    d should be < D

    // sampling at different frequencies should show up as statistically different:
    sampler = new PoissonSampler[Int](0.5)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.6)))
    d should be > D
  }

  test("replacement sampling with gap sampling") {
    // Tests expect maximum gap sampling fraction to be this value
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d: Double = 0.0

    var sampler = new PoissonSampler[Int](0.01)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.01)))
    d should be < D

    sampler = new PoissonSampler[Int](0.1)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.1)))
    d should be < D

    sampler = new PoissonSampler[Int](0.3)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.3)))
    d should be < D

    // sampling at different frequencies should show up as statistically different:
    sampler = new PoissonSampler[Int](0.3)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.4)))
    d should be > D
  }

  test("replacement boundary cases") {
    val data = (1 to 100).toArray

    var sampler = new PoissonSampler[Int](0.0)
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])

    sampler = new PoissonSampler[Int](0.0 - (RandomSampler.roundingEpsilon / 2.0))
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])

    // sampling with replacement has no upper bound on sampling fraction
    sampler = new PoissonSampler[Int](2.0)
    sampler.sample(data.iterator).length should be > (data.length)
  }

  test("replacement data types") {
    // Tests expect maximum gap sampling fraction to be this value
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d: Double = 0.0
    var sampler = new PoissonSampler[Int](0.1)
    sampler.setSeed(rngSeed.nextLong)

    // Array iterator (indexable type)
    d = medianKSD(
      gaps(sampler.sample(Iterator.from(0).take(20*sampleSize).toArray.iterator)),
      gaps(sampleWR(Iterator.from(0), 0.1)))
    d should be < D

    // ArrayBuffer iterator (indexable type)
    d = medianKSD(
      gaps(sampler.sample(Iterator.from(0).take(20*sampleSize).to[ArrayBuffer].iterator)),
      gaps(sampleWR(Iterator.from(0), 0.1)))
    d should be < D

    // List iterator (non-indexable type)
    d = medianKSD(
      gaps(sampler.sample(Iterator.from(0).take(20*sampleSize).toList.iterator)),
      gaps(sampleWR(Iterator.from(0), 0.1)))
    d should be < D
  }

  test("replacement clone") {
    // Tests expect maximum gap sampling fraction to be this value
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d = 0.0
    var sampler = new PoissonSampler[Int](0.1).clone
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.1)))
    d should be < D

    sampler = new PoissonSampler[Int](0.9).clone
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sampleWR(Iterator.from(0), 0.9)))
    d should be < D
  }

  test("replacement set seed") {
    RandomSampler.defaultMaxGapSamplingFraction should be (0.4)

    var d: Double = 0.0
    var sampler1 = new PoissonSampler[Int](0.2)
    var sampler2 = new PoissonSampler[Int](0.2)

    // distributions should be identical if the seeds are set to the same value
    sampler1.setSeed(73)
    sampler2.setSeed(73)
    d = medianKSD(gaps(sampler1.sample(Iterator.from(0))), gaps(sampler2.sample(Iterator.from(0))))
    d should be (0.0)

    // should be different for different seeds
    sampler1.setSeed(73)
    sampler2.setSeed(37)
    d = medianKSD(gaps(sampler1.sample(Iterator.from(0))), gaps(sampler2.sample(Iterator.from(0))))
    d should be > 0.0
    d should be < D

    sampler1 = new PoissonSampler[Int](0.8)
    sampler2 = new PoissonSampler[Int](0.8)

    // distributions should be identical if the seeds are set to the same value
    sampler1.setSeed(73)
    sampler2.setSeed(73)
    d = medianKSD(gaps(sampler1.sample(Iterator.from(0))), gaps(sampler2.sample(Iterator.from(0))))
    d should be (0.0)

    // should be different for different seeds
    sampler1.setSeed(73)
    sampler2.setSeed(37)
    d = medianKSD(gaps(sampler1.sample(Iterator.from(0))), gaps(sampler2.sample(Iterator.from(0))))
    d should be > 0.0
    d should be < D
  }

  test("bernoulli partitioning sampling") {
    var d: Double = 0.0

    var sampler = new BernoulliCellSampler[Int](0.1, 0.2)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.1)))
    d should be < D

    sampler = new BernoulliCellSampler[Int](0.1, 0.2, true)
    sampler.setSeed(rngSeed.nextLong)
    d = medianKSD(gaps(sampler.sample(Iterator.from(0))), gaps(sample(Iterator.from(0), 0.9)))
    d should be < D
  }

  test("bernoulli partitioning boundary cases") {
    val data = (1 to 100).toArray
    val d = RandomSampler.roundingEpsilon / 2.0

    var sampler = new BernoulliCellSampler[Int](0.0, 0.0)
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])

    sampler = new BernoulliCellSampler[Int](0.5, 0.5)
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])

    sampler = new BernoulliCellSampler[Int](1.0, 1.0)
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])

    sampler = new BernoulliCellSampler[Int](0.0, 1.0)
    sampler.sample(data.iterator).toArray should be (data)

    sampler = new BernoulliCellSampler[Int](0.0 - d, 1.0 + d)
    sampler.sample(data.iterator).toArray should be (data)

    sampler = new BernoulliCellSampler[Int](0.5, 0.5 - d)
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])
  }

  test("bernoulli partitioning data") {
    val seed = rngSeed.nextLong
    val data = (1 to 100).toArray

    var sampler = new BernoulliCellSampler[Int](0.4, 0.6)
    sampler.setSeed(seed)
    val s1 = sampler.sample(data.iterator).toArray
    s1.length should be > 0

    sampler = new BernoulliCellSampler[Int](0.4, 0.6, true)
    sampler.setSeed(seed)
    val s2 = sampler.sample(data.iterator).toArray
    s2.length should be > 0

    (s1 ++ s2).sorted should be (data)

    sampler = new BernoulliCellSampler[Int](0.5, 0.5)
    sampler.sample(data.iterator).toArray should be (Array.empty[Int])

    sampler = new BernoulliCellSampler[Int](0.5, 0.5, true)
    sampler.sample(data.iterator).toArray should be (data)
  }

  test("bernoulli partitioning clone") {
    val seed = rngSeed.nextLong
    val data = (1 to 100).toArray
    val base = new BernoulliCellSampler[Int](0.35, 0.65)

    var sampler = base.clone
    sampler.setSeed(seed)
    val s1 = sampler.sample(data.iterator).toArray
    s1.length should be > 0

    sampler = base.cloneComplement
    sampler.setSeed(seed)
    val s2 = sampler.sample(data.iterator).toArray
    s2.length should be > 0

    (s1 ++ s2).sorted should be (data)
  }
}