aboutsummaryrefslogtreecommitdiff
path: root/core/src/test
diff options
context:
space:
mode:
authorReynold Xin <rxin@apache.org>2014-07-18 12:41:50 -0700
committerReynold Xin <rxin@apache.org>2014-07-18 12:41:50 -0700
commit586e716e47305cd7c2c3ff35c0e828b63ef2f6a8 (patch)
tree0ebe507c73e48c12a4f7863b49e177c17c539098 /core/src/test
parent7f87ab98138d00723e007471f1a7f506650978cb (diff)
downloadspark-586e716e47305cd7c2c3ff35c0e828b63ef2f6a8.tar.gz
spark-586e716e47305cd7c2c3ff35c0e828b63ef2f6a8.tar.bz2
spark-586e716e47305cd7c2c3ff35c0e828b63ef2f6a8.zip
Reservoir sampling implementation.
This is going to be used in https://issues.apache.org/jira/browse/SPARK-2568 Author: Reynold Xin <rxin@apache.org> Closes #1478 from rxin/reservoirSample and squashes the following commits: 17bcbf3 [Reynold Xin] Added seed. badf20d [Reynold Xin] Renamed the method. 6940010 [Reynold Xin] Reservoir sampling implementation.
Diffstat (limited to 'core/src/test')
-rw-r--r--core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala21
1 files changed, 21 insertions, 0 deletions
diff --git a/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
index accfe2e9b7..73a9d029b0 100644
--- a/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
@@ -17,11 +17,32 @@
package org.apache.spark.util.random
+import scala.util.Random
+
import org.apache.commons.math3.distribution.{BinomialDistribution, PoissonDistribution}
import org.scalatest.FunSuite
class SamplingUtilsSuite extends FunSuite {
+ test("reservoirSampleAndCount") {
+ val input = Seq.fill(100)(Random.nextInt())
+
+ // input size < k
+ val (sample1, count1) = SamplingUtils.reservoirSampleAndCount(input.iterator, 150)
+ assert(count1 === 100)
+ assert(input === sample1.toSeq)
+
+ // input size == k
+ val (sample2, count2) = SamplingUtils.reservoirSampleAndCount(input.iterator, 100)
+ assert(count2 === 100)
+ assert(input === sample2.toSeq)
+
+ // input size > k
+ val (sample3, count3) = SamplingUtils.reservoirSampleAndCount(input.iterator, 10)
+ assert(count3 === 100)
+ assert(sample3.length === 10)
+ }
+
test("computeFraction") {
// test that the computed fraction guarantees enough data points
// in the sample with a failure rate <= 0.0001