[SPARK-13436][SPARKR] Added parameter drop to subsetting operator [

Added parameter drop to subsetting operator [. This is useful to get a Column from a DataFrame, given its name. R supports it. In R: ``` > name <- "Sepal_Length" > class(iris[, name]) [1] "numeric" ``` Currently, in SparkR: ``` > name <- "Sepal_Length" > class(irisDF[, name]) [1] "DataFrame" ``` Previous code returns a DataFrame, which is inconsistent with R's behavior. SparkR should return a Column instead. Currently, in order for the user to return a Column given a column name as a character variable would be through `eval(parse(x))`, where x is the string `"irisDF$Sepal_Length"`. That itself is pretty hacky. `SparkR:::getColumn() `is another choice, but I don't see why this method should be externalized. Instead, following R's way to do things, the proposed implementation allows this: ``` > name <- "Sepal_Length" > class(irisDF[, name, drop=T]) [1] "Column" > class(irisDF[, name, drop=F]) [1] "DataFrame" ``` This is consistent with R: ``` > name <- "Sepal_Length" > class(iris[, name]) [1] "numeric" > class(iris[, name, drop=F]) [1] "data.frame" ``` Author: Oscar D. Lara Yejas <odlaraye@oscars-mbp.usca.ibm.com> Author: Oscar D. Lara Yejas <odlaraye@oscars-mbp.attlocal.net> Closes #11318 from olarayej/SPARK-13436.
author: Oscar D. Lara Yejas <odlaraye@oscars-mbp.usca.ibm.com> 2016-04-27 15:47:54 -0700
committer: Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2016-04-27 15:47:54 -0700
commit: e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61 (patch)
tree: 58d4303824aca6fec6f9f6311f2dcf7a3cb1bd4e /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent: 37575115b98fdc9ebadb2ebcbcd9907a3af1076c (diff)
download: spark-e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61.tar.gz
spark-e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61.tar.bz2
spark-e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61.zip
1 files changed, 16 insertions, 8 deletions
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 336068035e..95d6cb8875 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -822,9 +822,10 @@ test_that("select operators", {
   expect_is(df[[2]], "Column")
   expect_is(df[["age"]], "Column")
 
-  expect_is(df[, 1], "SparkDataFrame")
-  expect_equal(columns(df[, 1]), c("name"))
-  expect_equal(columns(df[, "age"]), c("age"))
+  expect_is(df[, 1, drop = F], "SparkDataFrame")
+  expect_equal(columns(df[, 1, drop = F]), c("name"))
+  expect_equal(columns(df[, "age", drop = F]), c("age"))
+
   df2 <- df[, c("age", "name")]
   expect_is(df2, "SparkDataFrame")
   expect_equal(columns(df2), c("age", "name"))
@@ -835,6 +836,13 @@ test_that("select operators", {
   df$age2 <- df$age * 2
   expect_equal(columns(df), c("name", "age", "age2"))
   expect_equal(count(where(df, df$age2 == df$age * 2)), 2)
+
+  # Test parameter drop
+  expect_equal(class(df[, 1]) == "SparkDataFrame", T)
+  expect_equal(class(df[, 1, drop = T]) == "Column", T)
+  expect_equal(class(df[, 1, drop = F]) == "SparkDataFrame", T)
+  expect_equal(class(df[df$age > 4, 2, drop = T]) == "Column", T)
+  expect_equal(class(df[df$age > 4, 2, drop = F]) == "SparkDataFrame", T)
 })
 
 test_that("select with column", {
@@ -889,13 +897,13 @@ test_that("subsetting", {
   expect_equal(columns(filtered), c("name", "age"))
   expect_equal(collect(filtered)$name, "Andy")
 
-  df2 <- df[df$age == 19, 1]
+  df2 <- df[df$age == 19, 1, drop = F]
   expect_is(df2, "SparkDataFrame")
   expect_equal(count(df2), 1)
   expect_equal(columns(df2), c("name"))
   expect_equal(collect(df2)$name, "Justin")
 
-  df3 <- df[df$age > 20, 2]
+  df3 <- df[df$age > 20, 2, drop = F]
   expect_equal(count(df3), 1)
   expect_equal(columns(df3), c("age"))
 
@@ -911,7 +919,7 @@ test_that("subsetting", {
   expect_equal(count(df6), 1)
   expect_equal(columns(df6), c("name", "age"))
 
-  df7 <- subset(df, select = "name")
+  df7 <- subset(df, select = "name", drop = F)
   expect_equal(count(df7), 3)
   expect_equal(columns(df7), c("name"))
 
@@ -1888,7 +1896,7 @@ test_that("attach() on a DataFrame", {
   stat2 <- summary(age)
   expect_equal(collect(stat2)[5, "age"], "30")
   detach("df")
-  stat3 <- summary(df[, "age"])
+  stat3 <- summary(df[, "age", drop = F])
   expect_equal(collect(stat3)[5, "age"], "30")
   expect_error(age)
 })
@@ -1928,7 +1936,7 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", {
   df1 <- select(df, cast(df$age, "integer"))
   coltypes(df) <- c("character", "integer")
   expect_equal(dtypes(df), list(c("name", "string"), c("age", "int")))
-  value <- collect(df[, 2])[[3, 1]]
+  value <- collect(df[, 2, drop = F])[[3, 1]]
   expect_equal(value, collect(df1)[[3, 1]])
   expect_equal(value, 22)
author	Oscar D. Lara Yejas <odlaraye@oscars-mbp.usca.ibm.com>	2016-04-27 15:47:54 -0700
committer	Shivaram Venkataraman <shivaram@cs.berkeley.edu>	2016-04-27 15:47:54 -0700
commit	e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61 (patch)
tree	58d4303824aca6fec6f9f6311f2dcf7a3cb1bd4e /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent	37575115b98fdc9ebadb2ebcbcd9907a3af1076c (diff)
download	spark-e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61.tar.gz spark-e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61.tar.bz2 spark-e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61.zip