aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorfelixcheung <felixcheung_m@hotmail.com>2015-08-28 18:35:01 -0700
committerShivaram Venkataraman <shivaram@cs.berkeley.edu>2015-08-28 18:35:13 -0700
commitb7aab1d1838bdffdf29923fc0f18eb04e582957e (patch)
treefb573e7fc44ceb04d648495eab66e35ee86aab74
parentdf4a2e68ad4117ffadd63f5032ff8ca4972c2772 (diff)
downloadspark-b7aab1d1838bdffdf29923fc0f18eb04e582957e.tar.gz
spark-b7aab1d1838bdffdf29923fc0f18eb04e582957e.tar.bz2
spark-b7aab1d1838bdffdf29923fc0f18eb04e582957e.zip
[SPARK-9803] [SPARKR] Add subset and transform + tests
Add subset and transform Also reorganize `[` & `[[` to subset instead of select Note: for transform, transform is very similar to mutate. Spark doesn't seem to replace existing column with the name in mutate (ie. `mutate(df, age = df$age + 2)` - returned DataFrame has 2 columns with the same name 'age'), so therefore not doing that for now in transform. Though it is clearly stated it should replace column with matching name (should I open a JIRA for mutate/transform?) Author: felixcheung <felixcheung_m@hotmail.com> Closes #8503 from felixcheung/rsubset_transform. (cherry picked from commit 2a4e00ca4d4e7a148b4ff8ce0ad1c6d517cee55f) Signed-off-by: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
-rw-r--r--R/pkg/NAMESPACE2
-rw-r--r--R/pkg/R/DataFrame.R70
-rw-r--r--R/pkg/R/generics.R10
-rw-r--r--R/pkg/inst/tests/test_sparkSQL.R20
4 files changed, 85 insertions, 17 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 5286c01986..9d39630706 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -69,9 +69,11 @@ exportMethods("arrange",
"selectExpr",
"show",
"showDF",
+ "subset",
"summarize",
"summary",
"take",
+ "transform",
"unionAll",
"unique",
"unpersist",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 19cf1a9e07..f833e70a0f 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -956,7 +956,7 @@ setMethod("$<-", signature(x = "DataFrame"),
setClassUnion("numericOrcharacter", c("numeric", "character"))
-#' @rdname select
+#' @rdname subset
#' @name [[
setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
function(x, i) {
@@ -967,7 +967,7 @@ setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
getColumn(x, i)
})
-#' @rdname select
+#' @rdname subset
#' @name [
setMethod("[", signature(x = "DataFrame", i = "missing"),
function(x, i, j, ...) {
@@ -981,7 +981,7 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
select(x, j)
})
-#' @rdname select
+#' @rdname subset
#' @name [
setMethod("[", signature(x = "DataFrame", i = "Column"),
function(x, i, j, ...) {
@@ -989,12 +989,43 @@ setMethod("[", signature(x = "DataFrame", i = "Column"),
# https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
filtered <- filter(x, i)
if (!missing(j)) {
- filtered[, j]
+ filtered[, j, ...]
} else {
filtered
}
})
+#' Subset
+#'
+#' Return subsets of DataFrame according to given conditions
+#' @param x A DataFrame
+#' @param subset A logical expression to filter on rows
+#' @param select expression for the single Column or a list of columns to select from the DataFrame
+#' @return A new DataFrame containing only the rows that meet the condition with selected columns
+#' @export
+#' @rdname subset
+#' @name subset
+#' @aliases [
+#' @family subsetting functions
+#' @examples
+#' \dontrun{
+#' # Columns can be selected using `[[` and `[`
+#' df[[2]] == df[["age"]]
+#' df[,2] == df[,"age"]
+#' df[,c("name", "age")]
+#' # Or to filter rows
+#' df[df$age > 20,]
+#' # DataFrame can be subset on both rows and Columns
+#' df[df$name == "Smith", c(1,2)]
+#' df[df$age %in% c(19, 30), 1:2]
+#' subset(df, df$age %in% c(19, 30), 1:2)
+#' subset(df, df$age %in% c(19), select = c(1,2))
+#' }
+setMethod("subset", signature(x = "DataFrame"),
+ function(x, subset, select, ...) {
+ x[subset, select, ...]
+ })
+
#' Select
#'
#' Selects a set of columns with names or Column expressions.
@@ -1003,6 +1034,8 @@ setMethod("[", signature(x = "DataFrame", i = "Column"),
#' @return A new DataFrame with selected columns
#' @export
#' @rdname select
+#' @name select
+#' @family subsetting functions
#' @examples
#' \dontrun{
#' select(df, "*")
@@ -1010,15 +1043,8 @@ setMethod("[", signature(x = "DataFrame", i = "Column"),
#' select(df, df$name, df$age + 1)
#' select(df, c("col1", "col2"))
#' select(df, list(df$name, df$age + 1))
-#' # Columns can also be selected using `[[` and `[`
-#' df[[2]] == df[["age"]]
-#' df[,2] == df[,"age"]
-#' df[,c("name", "age")]
#' # Similar to R data frames columns can also be selected using `$`
#' df$age
-#' # It can also be subset on rows and Columns
-#' df[df$name == "Smith", c(1,2)]
-#' df[df$age %in% c(19, 30), 1:2]
#' }
setMethod("select", signature(x = "DataFrame", col = "character"),
function(x, col, ...) {
@@ -1090,7 +1116,7 @@ setMethod("selectExpr",
#' @return A DataFrame with the new column added.
#' @rdname withColumn
#' @name withColumn
-#' @aliases mutate
+#' @aliases mutate transform
#' @export
#' @examples
#'\dontrun{
@@ -1110,11 +1136,12 @@ setMethod("withColumn",
#'
#' Return a new DataFrame with the specified columns added.
#'
-#' @param x A DataFrame
+#' @param .data A DataFrame
#' @param col a named argument of the form name = col
#' @return A new DataFrame with the new columns added.
#' @rdname withColumn
#' @name mutate
+#' @aliases withColumn transform
#' @export
#' @examples
#'\dontrun{
@@ -1124,10 +1151,12 @@ setMethod("withColumn",
#' df <- jsonFile(sqlContext, path)
#' newDF <- mutate(df, newCol = df$col1 * 5, newCol2 = df$col1 * 2)
#' names(newDF) # Will contain newCol, newCol2
+#' newDF2 <- transform(df, newCol = df$col1 / 5, newCol2 = df$col1 * 2)
#' }
setMethod("mutate",
- signature(x = "DataFrame"),
- function(x, ...) {
+ signature(.data = "DataFrame"),
+ function(.data, ...) {
+ x <- .data
cols <- list(...)
stopifnot(length(cols) > 0)
stopifnot(class(cols[[1]]) == "Column")
@@ -1142,6 +1171,16 @@ setMethod("mutate",
do.call(select, c(x, x$"*", cols))
})
+#' @export
+#' @rdname withColumn
+#' @name transform
+#' @aliases withColumn mutate
+setMethod("transform",
+ signature(`_data` = "DataFrame"),
+ function(`_data`, ...) {
+ mutate(`_data`, ...)
+ })
+
#' WithColumnRenamed
#'
#' Rename an existing column in a DataFrame.
@@ -1269,6 +1308,7 @@ setMethod("orderBy",
#' @return A DataFrame containing only the rows that meet the condition.
#' @rdname filter
#' @name filter
+#' @family subsetting functions
#' @export
#' @examples
#'\dontrun{
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index b578b8789d..43dd8d283a 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -467,7 +467,7 @@ setGeneric("merge")
#' @rdname withColumn
#' @export
-setGeneric("mutate", function(x, ...) {standardGeneric("mutate") })
+setGeneric("mutate", function(.data, ...) {standardGeneric("mutate") })
#' @rdname arrange
#' @export
@@ -507,6 +507,10 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
standardGeneric("saveAsTable")
})
+#' @rdname withColumn
+#' @export
+setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })
+
#' @rdname write.df
#' @export
setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })
@@ -531,6 +535,10 @@ setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr")
#' @export
setGeneric("showDF", function(x,...) { standardGeneric("showDF") })
+# @rdname subset
+# @export
+setGeneric("subset", function(x, subset, select, ...) { standardGeneric("subset") })
+
#' @rdname agg
#' @export
setGeneric("summarize", function(x,...) { standardGeneric("summarize") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 8ebd78ed61..2062bc72b1 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -612,6 +612,10 @@ test_that("subsetting", {
df5 <- df[df$age %in% c(19), c(1,2)]
expect_equal(count(df5), 1)
expect_equal(columns(df5), c("name", "age"))
+
+ df6 <- subset(df, df$age %in% c(30), c(1,2))
+ expect_equal(count(df6), 1)
+ expect_equal(columns(df6), c("name", "age"))
})
test_that("selectExpr() on a DataFrame", {
@@ -1028,7 +1032,7 @@ test_that("withColumn() and withColumnRenamed()", {
expect_equal(columns(newDF2)[1], "newerAge")
})
-test_that("mutate(), rename() and names()", {
+test_that("mutate(), transform(), rename() and names()", {
df <- jsonFile(sqlContext, jsonPath)
newDF <- mutate(df, newAge = df$age + 2)
expect_equal(length(columns(newDF)), 3)
@@ -1042,6 +1046,20 @@ test_that("mutate(), rename() and names()", {
names(newDF2) <- c("newerName", "evenNewerAge")
expect_equal(length(names(newDF2)), 2)
expect_equal(names(newDF2)[1], "newerName")
+
+ transformedDF <- transform(df, newAge = -df$age, newAge2 = df$age / 2)
+ expect_equal(length(columns(transformedDF)), 4)
+ expect_equal(columns(transformedDF)[3], "newAge")
+ expect_equal(columns(transformedDF)[4], "newAge2")
+ expect_equal(first(filter(transformedDF, transformedDF$name == "Andy"))$newAge, -30)
+
+ # test if transform on local data frames works
+ # ensure the proper signature is used - otherwise this will fail to run
+ attach(airquality)
+ result <- transform(Ozone, logOzone = log(Ozone))
+ expect_equal(nrow(result), 153)
+ expect_equal(ncol(result), 2)
+ detach(airquality)
})
test_that("write.df() on DataFrame and works with parquetFile", {