[SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)

Add support for

```
df[df$name == "Smith", c(1,2)]
df[df$age %in% c(19, 30), 1:2]
```

shivaram

Author: felixcheung <felixcheung_m@hotmail.com>

Closes #8394 from felixcheung/rsubset.

(cherry picked from commit 75d4773aa50e24972c533e8b48697fde586429eb)
Signed-off-by: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
author: felixcheung <felixcheung_m@hotmail.com> 2015-08-25 23:48:16 -0700
committer: Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2015-08-25 23:48:27 -0700
commit: 5220db9e352b5d5eae59cead9478ca0a9f73f16b (patch)
tree: b29a915aea82aecba360f36294e2a7314925dbe0
parent: 21a10a86d20ec1a6fea42286b4d2aae9ce7e848d (diff)
download: spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.tar.gz
spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.tar.bz2
spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.zip
2 files changed, 48 insertions, 1 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 10f3c4ea59..1d870ec99d 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -954,9 +954,11 @@ setMethod("$<-", signature(x = "DataFrame"),
             x
           })
 
+setClassUnion("numericOrcharacter", c("numeric", "character"))
+
 #' @rdname select
 #' @name [[
-setMethod("[[", signature(x = "DataFrame"),
+setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
           function(x, i) {
             if (is.numeric(i)) {
               cols <- columns(x)
@@ -979,6 +981,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
             select(x, j)
           })
 
+#' @rdname select
+#' @name [
+setMethod("[", signature(x = "DataFrame", i = "Column"),
+          function(x, i, j, ...) {
+            # It could handle i as "character" but it seems confusing and not required
+            # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
+            filtered <- filter(x, i)
+            if (!missing(j)) {
+              filtered[, j]
+            } else {
+              filtered
+            }
+          })
+
 #' Select
 #'
 #' Selects a set of columns with names or Column expressions.
@@ -997,8 +1013,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
 #'   # Columns can also be selected using `[[` and `[`
 #'   df[[2]] == df[["age"]]
 #'   df[,2] == df[,"age"]
+#'   df[,c("name", "age")]
 #'   # Similar to R data frames columns can also be selected using `$`
 #'   df$age
+#'   # It can also be subset on rows and Columns
+#'   df[df$name == "Smith", c(1,2)]
+#'   df[df$age %in% c(19, 30), 1:2]
 #' }
 setMethod("select", signature(x = "DataFrame", col = "character"),
           function(x, col, ...) {
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 556b8c5447..ee48a3dc0c 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -587,6 +587,33 @@ test_that("select with column", {
   expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
 })
 
+test_that("subsetting", {
+  # jsonFile returns columns in random order
+  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+  filtered <- df[df$age > 20,]
+  expect_equal(count(filtered), 1)
+  expect_equal(columns(filtered), c("name", "age"))
+  expect_equal(collect(filtered)$name, "Andy")
+
+  df2 <- df[df$age == 19, 1]
+  expect_is(df2, "DataFrame")
+  expect_equal(count(df2), 1)
+  expect_equal(columns(df2), c("name"))
+  expect_equal(collect(df2)$name, "Justin")
+
+  df3 <- df[df$age > 20, 2]
+  expect_equal(count(df3), 1)
+  expect_equal(columns(df3), c("age"))
+
+  df4 <- df[df$age %in% c(19, 30), 1:2]
+  expect_equal(count(df4), 2)
+  expect_equal(columns(df4), c("name", "age"))
+  
+  df5 <- df[df$age %in% c(19), c(1,2)]
+  expect_equal(count(df5), 1)
+  expect_equal(columns(df5), c("name", "age"))
+})
+
 test_that("selectExpr() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
   selected <- selectExpr(df, "age * 2")
author	felixcheung <felixcheung_m@hotmail.com>	2015-08-25 23:48:16 -0700
committer	Shivaram Venkataraman <shivaram@cs.berkeley.edu>	2015-08-25 23:48:27 -0700
commit	5220db9e352b5d5eae59cead9478ca0a9f73f16b (patch)
tree	b29a915aea82aecba360f36294e2a7314925dbe0
parent	21a10a86d20ec1a6fea42286b4d2aae9ce7e848d (diff)
download	spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.tar.gz spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.tar.bz2 spark-5220db9e352b5d5eae59cead9478ca0a9f73f16b.zip