From 79f5f281bb69cb2de9f64006180abd753e8ae427 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Wed, 7 Dec 2016 17:34:45 +0800 Subject: [SPARK-18678][ML] Skewed reservoir sampling in SamplingUtils ## What changes were proposed in this pull request? Fix reservoir sampling bias for small k. An off-by-one error meant that the probability of replacement was slightly too high -- k/(l-1) after l elements instead of k/l, which matters for small k. ## How was this patch tested? Existing test plus new test case. Author: Sean Owen Closes #16129 from srowen/SPARK-18678. --- R/pkg/inst/tests/testthat/test_mllib.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'R/pkg/inst/tests') diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 0802a2ae48..4758e40e41 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -1007,10 +1007,11 @@ test_that("spark.randomForest", { model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, numTrees = 20, seed = 123) predictions <- collect(predict(model, data)) - expect_equal(predictions$prediction, c(60.379, 61.096, 60.636, 62.258, - 63.736, 64.296, 64.868, 64.300, - 66.709, 67.697, 67.966, 67.252, - 68.866, 69.593, 69.195, 69.658), + expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070, + 63.53160, 64.05470, 65.12710, 64.30450, + 66.70910, 67.86125, 68.08700, 67.21865, + 68.89275, 69.53180, 69.39640, 69.68250), + tolerance = 1e-4) stats <- summary(model) expect_equal(stats$numTrees, 20) -- cgit v1.2.3