diff options
author | qhuang <qian.huang@intel.com> | 2015-05-05 20:39:56 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-05-05 20:39:56 -0700 |
commit | a4669443999dc13a1bb34509c827d8b9096ea84f (patch) | |
tree | 1d1b27d2c50cdee371ef530f77deaedc0d39549b /R | |
parent | 51b3d41e160a1326a04536241b427e65b39ed8df (diff) | |
download | spark-a4669443999dc13a1bb34509c827d8b9096ea84f.tar.gz spark-a4669443999dc13a1bb34509c827d8b9096ea84f.tar.bz2 spark-a4669443999dc13a1bb34509c827d8b9096ea84f.zip |
[SPARK-6841] [SPARKR] add support for mean, median, stdev etc.
Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241
sum() has been implemented. (https://github.com/amplab-extras/SparkR-pkg/pull/242)
In this Phase 1, mean, sd, and var have been implemented, but some things still need to be improved following the suggestions in https://issues.apache.org/jira/browse/SPARK-6841
Author: qhuang <qian.huang@intel.com>
Closes #5446 from hqzizania/R and squashes the following commits:
f283572 [qhuang] add test unit for describe()
2e74d5a [qhuang] add describe() DataFrame API
Diffstat (limited to 'R')
-rw-r--r-- | R/pkg/NAMESPACE | 1 | ||||
-rw-r--r-- | R/pkg/R/DataFrame.R | 37 | ||||
-rw-r--r-- | R/pkg/R/generics.R | 4 | ||||
-rw-r--r-- | R/pkg/inst/tests/test_sparkSQL.R | 11 |
4 files changed, 53 insertions, 0 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 1fb3311b7f..528e6608c3 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -13,6 +13,7 @@ exportMethods("cache",
               "collect",
               "columns",
               "count",
+              "describe",
               "distinct",
               "dtypes",
               "except",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 841e77e55e..56c305d912 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
             callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
           })
 
+#' describe
+#'
+#' Computes statistics for numeric columns.
+#' If no columns are given, this function computes statistics for all numerical columns.
+#'
+#' @param x A DataFrame to be computed.
+#' @param col A string of name
+#' @param ... Additional expressions
+#' @return A DataFrame
+#' @rdname describe
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' describe(df)
+#' describe(df, "col1")
+#' describe(df, "col1", "col2")
+#' }
+setMethod("describe",
+          signature(x = "DataFrame", col = "character"),
+          function(x, col, ...) {
+            colList <- list(col, ...)
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
+
+#' @rdname describe
+setMethod("describe",
+          signature(x = "DataFrame"),
+          function(x) {
+            colList <- as.list(c(columns(x)))
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e88729387e..5838955f74 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
 #' @export
 setGeneric("columns", function(x) {standardGeneric("columns") })
 
+#' @rdname describe
+#' @export
+setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
+
 #' @rdname schema
 #' @export
 setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index f82e56fdd8..7a42e289fc 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
   expect_true(count(parquetDF) == count(df)*2)
 })
 
+test_that("describe() on a DataFrame", {
+  df <- jsonFile(sqlCtx, jsonPath)
+  stats <- describe(df, "age")
+  expect_true(collect(stats)[1, "summary"] == "count")
+  expect_true(collect(stats)[2, "age"] == 24.5)
+  expect_true(collect(stats)[3, "age"] == 5.5)
+  stats <- describe(df)
+  expect_true(collect(stats)[4, "name"] == "Andy")
+  expect_true(collect(stats)[5, "age"] == 30.0)
+})
+
 unlink(parquetPath)
 unlink(jsonPath)