From 586e716e47305cd7c2c3ff35c0e828b63ef2f6a8 Mon Sep 17 00:00:00 2001
From: Reynold Xin <rxin@apache.org>
Date: Fri, 18 Jul 2014 12:41:50 -0700
Subject: Reservoir sampling implementation.

This is going to be used in https://issues.apache.org/jira/browse/SPARK-2568

Author: Reynold Xin <rxin@apache.org>

Closes #1478 from rxin/reservoirSample and squashes the following commits:

17bcbf3 [Reynold Xin] Added seed.
badf20d [Reynold Xin] Renamed the method.
6940010 [Reynold Xin] Reservoir sampling implementation.
---
 .../spark/util/random/SamplingUtilsSuite.scala      | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'core/src/test')

diff --git a/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
index accfe2e9b7..73a9d029b0 100644
--- a/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/random/SamplingUtilsSuite.scala
@@ -17,11 +17,32 @@
 
 package org.apache.spark.util.random
 
+import scala.util.Random
+
 import org.apache.commons.math3.distribution.{BinomialDistribution, PoissonDistribution}
 import org.scalatest.FunSuite
 
 class SamplingUtilsSuite extends FunSuite {
 
+  test("reservoirSampleAndCount") {
+    val input = Seq.fill(100)(Random.nextInt())
+
+    // input size < k
+    val (sample1, count1) = SamplingUtils.reservoirSampleAndCount(input.iterator, 150)
+    assert(count1 === 100)
+    assert(input === sample1.toSeq)
+
+    // input size == k
+    val (sample2, count2) = SamplingUtils.reservoirSampleAndCount(input.iterator, 100)
+    assert(count2 === 100)
+    assert(input === sample2.toSeq)
+
+    // input size > k
+    val (sample3, count3) = SamplingUtils.reservoirSampleAndCount(input.iterator, 10)
+    assert(count3 === 100)
+    assert(sample3.length === 10)
+  }
+
   test("computeFraction") {
     // test that the computed fraction guarantees enough data points
     // in the sample with a failure rate <= 0.0001
-- 
cgit v1.2.3