From b40659838245ecaefb4e83d2ec6155f3f23a6675 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 17 Feb 2017 11:58:39 -0800 Subject: [SPARK-18285][SPARKR] SparkR approxQuantile supports input multiple columns ## What changes were proposed in this pull request? SparkR ```approxQuantile``` supports input multiple columns. ## How was this patch tested? Unit test. Author: Yanbo Liang Closes #16951 from yanboliang/spark-19619. --- R/pkg/R/generics.R | 2 +- R/pkg/R/stats.R | 25 +++++++++++++++++-------- R/pkg/inst/tests/testthat/test_sparkSQL.R | 18 +++++++++++++----- 3 files changed, 31 insertions(+), 14 deletions(-) (limited to 'R') diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 68864e6fe1..11940d3560 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -66,7 +66,7 @@ setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("fre # @rdname approxQuantile # @export setGeneric("approxQuantile", - function(x, col, probabilities, relativeError) { + function(x, cols, probabilities, relativeError) { standardGeneric("approxQuantile") }) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index dcd7198f41..8d1d165052 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -138,9 +138,9 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), collect(dataFrame(sct)) }) -#' Calculates the approximate quantiles of a numerical column of a SparkDataFrame +#' Calculates the approximate quantiles of numerical columns of a SparkDataFrame #' -#' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. +#' Calculates the approximate quantiles of numerical columns of a SparkDataFrame. #' The result of this algorithm has the following deterministic bound: #' If the SparkDataFrame has N elements and if we request the quantile at probability p up to #' error err, then the algorithm will return a sample x from the SparkDataFrame so that the @@ -149,15 +149,19 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), #' This method implements a variation of the Greenwald-Khanna algorithm (with some speed #' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 #' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. +#' Note that rows containing any NA values will be removed before calculation. #' #' @param x A SparkDataFrame. -#' @param col The name of the numerical column. +#' @param cols A single column name, or a list of names for multiple columns. #' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1]. #' For example 0 is the minimum, 0.5 is the median, 1 is the maximum. #' @param relativeError The relative target precision to achieve (>= 0). If set to zero, #' the exact quantiles are computed, which could be very expensive. #' Note that values greater than 1 are accepted but give the same result as 1. -#' @return The approximate quantiles at the given probabilities. +#' @return The approximate quantiles at the given probabilities. If the input is a single column name, +#' the output is a list of approximate quantiles in that column; If the input is +#' multiple column names, the output should be a list, and each element in it is a list of +#' numeric values which represents the approximate quantiles in corresponding column. #' #' @rdname approxQuantile #' @name approxQuantile @@ -171,12 +175,17 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), #' } #' @note approxQuantile since 2.0.0 setMethod("approxQuantile", - signature(x = "SparkDataFrame", col = "character", + signature(x = "SparkDataFrame", cols = "character", probabilities = "numeric", relativeError = "numeric"), - function(x, col, probabilities, relativeError) { + function(x, cols, probabilities, relativeError) { statFunctions <- callJMethod(x@sdf, "stat") - callJMethod(statFunctions, "approxQuantile", col, - as.list(probabilities), relativeError) + quantiles <- callJMethod(statFunctions, "approxQuantile", as.list(cols), + as.list(probabilities), relativeError) + if (length(cols) == 1) { + quantiles[[1]] + } else { + quantiles + } }) #' Returns a stratified sample without replacement diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 199eb2057f..a7259f362e 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2222,11 +2222,19 @@ test_that("sampleBy() on a DataFrame", { }) test_that("approxQuantile() on a DataFrame", { - l <- lapply(c(0:99), function(i) { i }) - df <- createDataFrame(l, "key") - quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0) - expect_equal(quantiles[[1]], 50) - expect_equal(quantiles[[2]], 80) + l <- lapply(c(0:99), function(i) { list(i, 99 - i) }) + df <- createDataFrame(l, list("a", "b")) + quantiles <- approxQuantile(df, "a", c(0.5, 0.8), 0.0) + expect_equal(quantiles, list(50, 80)) + quantiles2 <- approxQuantile(df, c("a", "b"), c(0.5, 0.8), 0.0) + expect_equal(quantiles2[[1]], list(50, 80)) + expect_equal(quantiles2[[2]], list(50, 80)) + + dfWithNA <- createDataFrame(data.frame(a = c(NA, 30, 19, 11, 28, 15), + b = c(-30, -19, NA, -11, -28, -15))) + quantiles3 <- approxQuantile(dfWithNA, c("a", "b"), c(0.5), 0.0) + expect_equal(quantiles3[[1]], list(28)) + expect_equal(quantiles3[[2]], list(-15)) }) test_that("SQL error message is returned from JVM", { -- cgit v1.2.3