aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorfelixcheung <felixcheung_m@hotmail.com>2015-08-25 23:48:16 -0700
committerShivaram Venkataraman <shivaram@cs.berkeley.edu>2015-08-25 23:48:27 -0700
commit5220db9e352b5d5eae59cead9478ca0a9f73f16b (patch)
treeb29a915aea82aecba360f36294e2a7314925dbe0
parent21a10a86d20ec1a6fea42286b4d2aae9ce7e848d (diff)
downloadspark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.tar.gz
spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.tar.bz2
spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.zip
[SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)
Add support for
```
df[df$name == "Smith", c(1,2)]
df[df$age %in% c(19, 30), 1:2]
```
shivaram

Author: felixcheung <felixcheung_m@hotmail.com>

Closes #8394 from felixcheung/rsubset.

(cherry picked from commit 75d4773aa50e24972c533e8b48697fde586429eb)
Signed-off-by: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
-rw-r--r--R/pkg/R/DataFrame.R22
-rw-r--r--R/pkg/inst/tests/test_sparkSQL.R27
2 files changed, 48 insertions, 1 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 10f3c4ea59..1d870ec99d 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -954,9 +954,11 @@ setMethod("$<-", signature(x = "DataFrame"),
x
})
+setClassUnion("numericOrcharacter", c("numeric", "character"))
+
#' @rdname select
#' @name [[
-setMethod("[[", signature(x = "DataFrame"),
+setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
function(x, i) {
if (is.numeric(i)) {
cols <- columns(x)
@@ -979,6 +981,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
select(x, j)
})
+#' @rdname select
+#' @name [
+setMethod("[", signature(x = "DataFrame", i = "Column"),
+ function(x, i, j, ...) {
+ # Rows are filtered by the Column i; a "character" i could be handled but seems confusing and is not required, see
+ # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
+ filtered <- filter(x, i)
+ if (!missing(j)) {
+ filtered[, j]
+ } else {
+ filtered
+ }
+ })
+
#' Select
#'
#' Selects a set of columns with names or Column expressions.
@@ -997,8 +1013,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
#' # Columns can also be selected using `[[` and `[`
#' df[[2]] == df[["age"]]
#' df[,2] == df[,"age"]
+#' df[,c("name", "age")]
#' # Similar to R data frames columns can also be selected using `$`
#' df$age
+#' # Rows and columns can also be subset together using `[`
+#' df[df$name == "Smith", c(1,2)]
+#' df[df$age %in% c(19, 30), 1:2]
#' }
setMethod("select", signature(x = "DataFrame", col = "character"),
function(x, col, ...) {
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 556b8c5447..ee48a3dc0c 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -587,6 +587,33 @@ test_that("select with column", {
expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
})
+test_that("subsetting", {
+ # jsonFile returns columns in random order, so select() is used to pin the order to (name, age)
+ df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+ filtered <- df[df$age > 20,]
+ expect_equal(count(filtered), 1)
+ expect_equal(columns(filtered), c("name", "age"))
+ expect_equal(collect(filtered)$name, "Andy")
+
+ df2 <- df[df$age == 19, 1]
+ expect_is(df2, "DataFrame")
+ expect_equal(count(df2), 1)
+ expect_equal(columns(df2), c("name"))
+ expect_equal(collect(df2)$name, "Justin")
+
+ df3 <- df[df$age > 20, 2]
+ expect_equal(count(df3), 1)
+ expect_equal(columns(df3), c("age"))
+
+ df4 <- df[df$age %in% c(19, 30), 1:2]
+ expect_equal(count(df4), 2)
+ expect_equal(columns(df4), c("name", "age"))
+
+ df5 <- df[df$age %in% c(19), c(1,2)]
+ expect_equal(count(df5), 1)
+ expect_equal(columns(df5), c("name", "age"))
+})
+
test_that("selectExpr() on a DataFrame", {
df <- jsonFile(sqlContext, jsonPath)
selected <- selectExpr(df, "age * 2")