aboutsummaryrefslogtreecommitdiff
path: root/R
diff options
context:
space:
mode:
authorqhuang <qian.huang@intel.com>2015-05-05 20:39:56 -0700
committerReynold Xin <rxin@databricks.com>2015-05-05 20:39:56 -0700
commita4669443999dc13a1bb34509c827d8b9096ea84f (patch)
tree1d1b27d2c50cdee371ef530f77deaedc0d39549b /R
parent51b3d41e160a1326a04536241b427e65b39ed8df (diff)
downloadspark-a4669443999dc13a1bb34509c827d8b9096ea84f.tar.gz
spark-a4669443999dc13a1bb34509c827d8b9096ea84f.tar.bz2
spark-a4669443999dc13a1bb34509c827d8b9096ea84f.zip
[SPARK-6841] [SPARKR] add support for mean, median, stdev etc.
Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241 sum() has been implemented. (https://github.com/amplab-extras/SparkR-pkg/pull/242) Now Phase 1: mean, sd, var have been implemented, but some things still need to be improved with the suggestions in https://issues.apache.org/jira/browse/SPARK-6841 Author: qhuang <qian.huang@intel.com> Closes #5446 from hqzizania/R and squashes the following commits: f283572 [qhuang] add test unit for describe() 2e74d5a [qhuang] add describe() DataFrame API
Diffstat (limited to 'R')
-rw-r--r--R/pkg/NAMESPACE1
-rw-r--r--R/pkg/R/DataFrame.R37
-rw-r--r--R/pkg/R/generics.R4
-rw-r--r--R/pkg/inst/tests/test_sparkSQL.R11
4 files changed, 53 insertions, 0 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 1fb3311b7f..528e6608c3 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -13,6 +13,7 @@ exportMethods("cache",
"collect",
"columns",
"count",
+ "describe",
"distinct",
"dtypes",
"except",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 841e77e55e..56c305d912 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
})
+#' describe
+#'
+#' Computes statistics for numeric columns.
+#' If no columns are given, this function computes statistics for all numerical columns.
+#'
+#' @param x A DataFrame to be computed.
+#' @param col A string specifying the name of a column
+#' @param ... Additional strings naming further columns
+#' @return A DataFrame
+#' @rdname describe
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' describe(df)
+#' describe(df, "col1")
+#' describe(df, "col1", "col2")
+#' }
+setMethod("describe",
+ signature(x = "DataFrame", col = "character"),
+ function(x, col, ...) {
+ colList <- list(col, ...)
+ sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+ dataFrame(sdf)
+ })
+
+#' @rdname describe
+setMethod("describe",
+ signature(x = "DataFrame"),
+ function(x) {
+ colList <- as.list(c(columns(x)))
+ sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+ dataFrame(sdf)
+ })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e88729387e..5838955f74 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
#' @export
setGeneric("columns", function(x) {standardGeneric("columns") })
+#' @rdname describe
+#' @export
+setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
+
#' @rdname schema
#' @export
setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index f82e56fdd8..7a42e289fc 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
expect_true(count(parquetDF) == count(df)*2)
})
+test_that("describe() on a DataFrame", {
+ df <- jsonFile(sqlCtx, jsonPath)
+ stats <- describe(df, "age")
+ expect_true(collect(stats)[1, "summary"] == "count")
+ expect_true(collect(stats)[2, "age"] == 24.5)
+ expect_true(collect(stats)[3, "age"] == 5.5)
+ stats <- describe(df)
+ expect_true(collect(stats)[4, "name"] == "Andy")
+ expect_true(collect(stats)[5, "age"] == 30.0)
+})
+
unlink(parquetPath)
unlink(jsonPath)