diff options
author | felixcheung <felixcheung_m@hotmail.com> | 2015-08-25 23:48:16 -0700 |
---|---|---|
committer | Shivaram Venkataraman <shivaram@cs.berkeley.edu> | 2015-08-25 23:48:27 -0700 |
commit | 5220db9e352b5d5eae59cead9478ca0a9f73f16b (patch) | |
tree | b29a915aea82aecba360f36294e2a7314925dbe0 | |
parent | 21a10a86d20ec1a6fea42286b4d2aae9ce7e848d (diff) | |
download | spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.tar.gz spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.tar.bz2 spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.zip |
[SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)
Add support for subsetting a DataFrame by rows and columns in a single `[` call, for example:
```
df[df$name == "Smith", c(1,2)]
df[df$age %in% c(19, 30), 1:2]
```
shivaram
Author: felixcheung <felixcheung_m@hotmail.com>
Closes #8394 from felixcheung/rsubset.
(cherry picked from commit 75d4773aa50e24972c533e8b48697fde586429eb)
Signed-off-by: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
-rw-r--r-- | R/pkg/R/DataFrame.R | 22 | ||||
-rw-r--r-- | R/pkg/inst/tests/test_sparkSQL.R | 27 |
2 files changed, 48 insertions, 1 deletion
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 10f3c4ea59..1d870ec99d 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -954,9 +954,11 @@ setMethod("$<-", signature(x = "DataFrame"), x }) +setClassUnion("numericOrcharacter", c("numeric", "character")) + #' @rdname select #' @name [[ -setMethod("[[", signature(x = "DataFrame"), +setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"), function(x, i) { if (is.numeric(i)) { cols <- columns(x) @@ -979,6 +981,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), select(x, j) }) +#' @rdname select +#' @name [ +setMethod("[", signature(x = "DataFrame", i = "Column"), + function(x, i, j, ...) { + # It could handle i as "character" but it seems confusing and not required + # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html + filtered <- filter(x, i) + if (!missing(j)) { + filtered[, j] + } else { + filtered + } + }) + #' Select #' #' Selects a set of columns with names or Column expressions. @@ -997,8 +1013,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"), #' # Columns can also be selected using `[[` and `[` #' df[[2]] == df[["age"]] #' df[,2] == df[,"age"] +#' df[,c("name", "age")] #' # Similar to R data frames columns can also be selected using `$` #' df$age +#' # It can also be subset on rows and Columns +#' df[df$name == "Smith", c(1,2)] +#' df[df$age %in% c(19, 30), 1:2] #' } setMethod("select", signature(x = "DataFrame", col = "character"), function(x, col, ...) 
{ diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 556b8c5447..ee48a3dc0c 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -587,6 +587,33 @@ test_that("select with column", { expect_equal(collect(select(df3, "x"))[[1, 1]], "x") }) +test_that("subsetting", { + # jsonFile returns columns in random order + df <- select(jsonFile(sqlContext, jsonPath), "name", "age") + filtered <- df[df$age > 20,] + expect_equal(count(filtered), 1) + expect_equal(columns(filtered), c("name", "age")) + expect_equal(collect(filtered)$name, "Andy") + + df2 <- df[df$age == 19, 1] + expect_is(df2, "DataFrame") + expect_equal(count(df2), 1) + expect_equal(columns(df2), c("name")) + expect_equal(collect(df2)$name, "Justin") + + df3 <- df[df$age > 20, 2] + expect_equal(count(df3), 1) + expect_equal(columns(df3), c("age")) + + df4 <- df[df$age %in% c(19, 30), 1:2] + expect_equal(count(df4), 2) + expect_equal(columns(df4), c("name", "age")) + + df5 <- df[df$age %in% c(19), c(1,2)] + expect_equal(count(df5), 1) + expect_equal(columns(df5), c("name", "age")) +}) + test_that("selectExpr() on a DataFrame", { df <- jsonFile(sqlContext, jsonPath) selected <- selectExpr(df, "age * 2") |