diff options
-rw-r--r-- | R/pkg/NAMESPACE | 1 | ||||
-rw-r--r-- | R/pkg/R/generics.R | 7 | ||||
-rw-r--r-- | R/pkg/R/stats.R | 39 | ||||
-rw-r--r-- | R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 |
4 files changed, 55 insertions, 0 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 6a3d63f43f..636d39e1e9 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -111,6 +111,7 @@ exportMethods("%in%", "add_months", "alias", "approxCountDistinct", + "approxQuantile", "array_contains", "asc", "ascii", diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ab61bce03d..3db72b5795 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -67,6 +67,13 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) # @export setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") }) +# @rdname statfunctions +# @export +setGeneric("approxQuantile", + function(x, col, probabilities, relativeError) { + standardGeneric("approxQuantile") + }) + # @rdname distinct # @export setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") }) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 2e8076843f..edf72937c6 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -130,6 +130,45 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"), collect(dataFrame(sct)) }) +#' approxQuantile +#' +#' Calculates the approximate quantiles of a numerical column of a DataFrame. +#' +#' The result of this algorithm has the following deterministic bound: +#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error +#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank +#' of `x` is close to (p * N). More precisely, +#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). +#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed +#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 +#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. +#' +#' @param x A SparkSQL DataFrame. +#' @param col The name of the numerical column. +#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1]. +#' For example 0 is the minimum, 0.5 is the median, 1 is the maximum. +#' @param relativeError The relative target precision to achieve (>= 0). If set to zero, +#' the exact quantiles are computed, which could be very expensive. +#' Note that values greater than 1 are accepted but give the same result as 1. +#' @return The approximate quantiles at the given probabilities. +#' +#' @rdname statfunctions +#' @name approxQuantile +#' @export +#' @examples +#' \dontrun{ +#' df <- jsonFile(sqlContext, "/path/to/file.json") +#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0) +#' } +setMethod("approxQuantile", + signature(x = "DataFrame", col = "character", + probabilities = "numeric", relativeError = "numeric"), + function(x, col, probabilities, relativeError) { + statFunctions <- callJMethod(x@sdf, "stat") + callJMethod(statFunctions, "approxQuantile", col, + as.list(probabilities), relativeError) + }) + #' sampleBy #' #' Returns a stratified sample without replacement based on the fraction given on each stratum. diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index cc118108f6..236bae6bde 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1785,6 +1785,14 @@ test_that("sampleBy() on a DataFrame", { expect_identical(as.list(result[2, ]), list(key = "1", count = 7)) }) +test_that("approxQuantile() on a DataFrame", { + l <- lapply(c(0:99), function(i) { i }) + df <- createDataFrame(sqlContext, l, "key") + quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0) + expect_equal(quantiles[[1]], 50) + expect_equal(quantiles[[2]], 80) +}) + test_that("SQL error message is returned from JVM", { retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e) expect_equal(grepl("Table not found: blah", retError), TRUE) |