author     Yanbo Liang <ybliang8@gmail.com>      2016-02-25 21:23:41 -0800
committer  Xiangrui Meng <meng@databricks.com>   2016-02-25 21:23:41 -0800
commit     50e60e36f7775a10cf39338e7c5716578a24d89f (patch)
tree       e0e0a09f875576531ef7f33e81004680fae5041b /R
parent     f3be369ef7b78944ce9cafc94b19df52772cea58 (diff)
[SPARK-13504] [SPARKR] Add approxQuantile for SparkR
## What changes were proposed in this pull request?

Add ```approxQuantile``` for SparkR.

## How was this patch tested?

Unit tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #11383 from yanboliang/spark-13504 and squashes the following commits:

4f17adb [Yanbo Liang] Add approxQuantile for SparkR
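For context, a minimal usage sketch of the new API in a SparkR session, mirroring the unit test added in this patch; the initialized sqlContext and the sample data are assumptions, not part of the diff:

# A minimal sketch, assuming an initialized sqlContext as used in the test suite
l <- lapply(0:99, function(i) { i })
df <- createDataFrame(sqlContext, l, "key")
# Median and 80th percentile; relativeError = 0.0 requests exact quantiles
quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
# quantiles is a list of numerics, one entry per requested probability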
Diffstat (limited to 'R')
-rw-r--r--  R/pkg/NAMESPACE                              1
-rw-r--r--  R/pkg/R/generics.R                           7
-rw-r--r--  R/pkg/R/stats.R                             39
-rw-r--r--  R/pkg/inst/tests/testthat/test_sparkSQL.R    8
4 files changed, 55 insertions(+), 0 deletions(-)
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 6a3d63f43f..636d39e1e9 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -111,6 +111,7 @@ exportMethods("%in%",
"add_months",
"alias",
"approxCountDistinct",
+ "approxQuantile",
"array_contains",
"asc",
"ascii",
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index ab61bce03d..3db72b5795 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -67,6 +67,13 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
# @export
setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
+# @rdname statfunctions
+# @export
+setGeneric("approxQuantile",
+ function(x, col, probabilities, relativeError) {
+ standardGeneric("approxQuantile")
+ })
+
# @rdname distinct
# @export
setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index 2e8076843f..edf72937c6 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -130,6 +130,45 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
collect(dataFrame(sct))
})
+#' approxQuantile
+#'
+#' Calculates the approximate quantiles of a numerical column of a DataFrame.
+#'
+#' The result of this algorithm has the following deterministic bound:
+#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error
+#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank
+#' of `x` is close to (p * N). More precisely,
+#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
+#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670
+#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param col The name of the numerical column.
+#' @param probabilities A list of quantile probabilities. Each number must belong to [0, 1].
+#' For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
+#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
+#' the exact quantiles are computed, which could be very expensive.
+#' Note that values greater than 1 are accepted but give the same result as 1.
+#' @return The approximate quantiles at the given probabilities.
+#'
+#' @rdname statfunctions
+#' @name approxQuantile
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlContext, "/path/to/file.json")
+#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+#' }
+setMethod("approxQuantile",
+ signature(x = "DataFrame", col = "character",
+ probabilities = "numeric", relativeError = "numeric"),
+ function(x, col, probabilities, relativeError) {
+ statFunctions <- callJMethod(x@sdf, "stat")
+ callJMethod(statFunctions, "approxQuantile", col,
+ as.list(probabilities), relativeError)
+ })
+
#' sampleBy
#'
#' Returns a stratified sample without replacement based on the fraction given on each stratum.
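The roxygen documentation added above states a deterministic rank bound. As an illustration only (the concrete values N = 100, p = 0.5, err = 0.1 are assumptions, not taken from the patch), the bound can be evaluated directly:

# Illustrative evaluation of the documented bound for assumed N, p, and err
N <- 100; p <- 0.5; err <- 0.1
lower <- floor((p - err) * N)      # 40
upper <- ceiling((p + err) * N)    # 60
# Any sample x returned for probability p must have an exact rank between lower and upper;
# with err = 0 the interval collapses to a single rank, i.e. the exact quantile.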
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index cc118108f6..236bae6bde 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1785,6 +1785,14 @@ test_that("sampleBy() on a DataFrame", {
expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
})
+test_that("approxQuantile() on a DataFrame", {
+ l <- lapply(c(0:99), function(i) { i })
+ df <- createDataFrame(sqlContext, l, "key")
+ quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+ expect_equal(quantiles[[1]], 50)
+ expect_equal(quantiles[[2]], 80)
+})
+
test_that("SQL error message is returned from JVM", {
retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
expect_equal(grepl("Table not found: blah", retError), TRUE)
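The new test pins relativeError to 0.0, so the documented bound collapses and the results are exact. A further hedged check, not part of this patch, follows from the @param documentation, which states that probability 0 is the minimum and 1 is the maximum:

# Hypothetical extension of the test above; df is the 0..99 DataFrame it creates
extremes <- approxQuantile(df, "key", c(0.0, 1.0), 0.0)
# Per the documentation, these should be the column's minimum and maximum
expect_equal(extremes[[1]], 0)
expect_equal(extremes[[2]], 99)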