[SPARK-18678][ML] Skewed reservoir sampling in SamplingUtils

## What changes were proposed in this pull request? Fix reservoir sampling bias for small k. An off-by-one error meant that the probability of replacement was slightly too high -- k/(l-1) after l elements instead of k/l -- which matters for small k. ## How was this patch tested? Existing test plus new test case. Author: Sean Owen <sowen@cloudera.com> Closes #16129 from srowen/SPARK-18678.
author: Sean Owen <sowen@cloudera.com> 2016-12-07 17:34:45 +0800
committer: Sean Owen <sowen@cloudera.com> 2016-12-07 17:34:45 +0800
commit: 79f5f281bb69cb2de9f64006180abd753e8ae427 (patch)
tree: a722ebc403cc4655e96e48b9e3de7502e04271a0 /R
parent: b8280271396eb74638da6546d76bbb2d06c7011b (diff)
download: spark-79f5f281bb69cb2de9f64006180abd753e8ae427.tar.gz
spark-79f5f281bb69cb2de9f64006180abd753e8ae427.tar.bz2
spark-79f5f281bb69cb2de9f64006180abd753e8ae427.zip
1 file changed, 5 insertions, 4 deletions
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 0802a2ae48..4758e40e41 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -1007,10 +1007,11 @@ test_that("spark.randomForest", {
   model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
                               numTrees = 20, seed = 123)
   predictions <- collect(predict(model, data))
-  expect_equal(predictions$prediction, c(60.379, 61.096, 60.636, 62.258,
-                                         63.736, 64.296, 64.868, 64.300,
-                                         66.709, 67.697, 67.966, 67.252,
-                                         68.866, 69.593, 69.195, 69.658),
+  expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
+                                         63.53160, 64.05470, 65.12710, 64.30450,
+                                         66.70910, 67.86125, 68.08700, 67.21865,
+                                         68.89275, 69.53180, 69.39640, 69.68250),
+
                tolerance = 1e-4)
   stats <- summary(model)
   expect_equal(stats$numTrees, 20)
author	Sean Owen <sowen@cloudera.com>	2016-12-07 17:34:45 +0800
committer	Sean Owen <sowen@cloudera.com>	2016-12-07 17:34:45 +0800
commit	79f5f281bb69cb2de9f64006180abd753e8ae427 (patch)
tree	a722ebc403cc4655e96e48b9e3de7502e04271a0 /R
parent	b8280271396eb74638da6546d76bbb2d06c7011b (diff)
download	spark-79f5f281bb69cb2de9f64006180abd753e8ae427.tar.gz spark-79f5f281bb69cb2de9f64006180abd753e8ae427.tar.bz2 spark-79f5f281bb69cb2de9f64006180abd753e8ae427.zip