author     Yanbo Liang <ybliang8@gmail.com>      2016-02-25 21:23:41 -0800
committer  Xiangrui Meng <meng@databricks.com>   2016-02-25 21:23:41 -0800
commit     50e60e36f7775a10cf39338e7c5716578a24d89f (patch)
tree       e0e0a09f875576531ef7f33e81004680fae5041b /R
parent     f3be369ef7b78944ce9cafc94b19df52772cea58 (diff)
[SPARK-13504] [SPARKR] Add approxQuantile for SparkR
## What changes were proposed in this pull request?

Add ```approxQuantile``` for SparkR.

## How was this patch tested?

Unit tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #11383 from yanboliang/spark-13504 and squashes the following commits:

4f17adb [Yanbo Liang] Add approxQuantile for SparkR
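For context, a minimal usage sketch of the new API in a SparkR session, mirroring the unit test added in this patch; the initialized sqlContext and the sample data are assumptions, not part of the diff:

# A minimal sketch, assuming an initialized sqlContext as used in the test suite
l <- lapply(0:99, function(i) { i })
df <- createDataFrame(sqlContext, l, "key")
# Median and 80th percentile; relativeError = 0.0 requests exact quantiles
quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
# quantiles is a list of numerics, one entry per requested probability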
Diffstat (limited to 'R')
-rw-r--r--  R/pkg/NAMESPACE                              1
-rw-r--r--  R/pkg/R/generics.R                           7
-rw-r--r--  R/pkg/R/stats.R                             39
-rw-r--r--  R/pkg/inst/tests/testthat/test_sparkSQL.R    8
4 files changed, 55 insertions(+), 0 deletions(-)
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 6a3d63f43f..636d39e1e9 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -111,6 +111,7 @@ exportMethods("%in%",
"add_months",
"alias",
"approxCountDistinct",
+ "approxQuantile",
"array_contains",
"asc",
"ascii",
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index ab61bce03d..3db72b5795 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -67,6 +67,13 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
# @export
setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
+# @rdname statfunctions
+# @export
+setGeneric("approxQuantile",
+ function(x, col, probabilities, relativeError) {
+ standardGeneric("approxQuantile")
+ })
+
# @rdname distinct
# @export
setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index 2e8076843f..edf72937c6 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -130,6 +130,45 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
collect(dataFrame(sct))
})
+#' approxQuantile
+#'
+#' Calculates the approximate quantiles of a numerical column of a DataFrame.
+#'
+#' The result of this algorithm has the following deterministic bound:
+#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error
+#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank
+#' of `x` is close to (p * N). More precisely,
+#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
+#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670
+#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param col The name of the numerical column.
+#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1].
+#' For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
+#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
+#' the exact quantiles are computed, which could be very expensive.
+#' Note that values greater than 1 are accepted but give the same result as 1.
+#' @return The approximate quantiles at the given probabilities.
+#'
+#' @rdname statfunctions
+#' @name approxQuantile
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlContext, "/path/to/file.json")
+#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+#' }
+setMethod("approxQuantile",
+ signature(x = "DataFrame", col = "character",
+ probabilities = "numeric", relativeError = "numeric"),
+ function(x, col, probabilities, relativeError) {
+ statFunctions <- callJMethod(x@sdf, "stat")
+ callJMethod(statFunctions, "approxQuantile", col,
+ as.list(probabilities), relativeError)
+ })
+
#' sampleBy
#'
#' Returns a stratified sample without replacement based on the fraction given on each stratum.
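The roxygen documentation added above states a deterministic rank bound. As an illustration only (the concrete values N = 100, p = 0.5, err = 0.1 are assumptions, not taken from the patch), the bound can be evaluated directly:

# Illustrative evaluation of the documented bound for assumed N, p, and err
N <- 100; p <- 0.5; err <- 0.1
lower <- floor((p - err) * N)      # 40
upper <- ceiling((p + err) * N)    # 60
# Any sample x returned for probability p must have an exact rank between lower and upper;
# with err = 0 the interval collapses to a single rank, i.e. the exact quantile.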
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index cc118108f6..236bae6bde 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1785,6 +1785,14 @@ test_that("sampleBy() on a DataFrame", {
expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
})
+test_that("approxQuantile() on a DataFrame", {
+ l <- lapply(c(0:99), function(i) { i })
+ df <- createDataFrame(sqlContext, l, "key")
+ quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+ expect_equal(quantiles[[1]], 50)
+ expect_equal(quantiles[[2]], 80)
+})
+
test_that("SQL error message is returned from JVM", {
retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
expect_equal(grepl("Table not found: blah", retError), TRUE)
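The new test pins relativeError to 0.0, so the documented bound collapses and the results are exact. A further hedged check, not part of this patch, follows from the @param documentation, which states that probability 0 is the minimum and 1 is the maximum:

# Hypothetical extension of the test above; df is the 0..99 DataFrame it creates
extremes <- approxQuantile(df, "key", c(0.0, 1.0), 0.0)
# Per the documentation, these should be the column's minimum and maximum
expect_equal(extremes[[1]], 0)
expect_equal(extremes[[2]], 99)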