From e4bfb4aa7382cb9c5e4eb7e2211551d5da716a61 Mon Sep 17 00:00:00 2001
From: "Oscar D. Lara Yejas"
Date: Wed, 27 Apr 2016 15:47:54 -0700
Subject: [SPARK-13436][SPARKR] Added parameter drop to subsetting operator [

Added parameter drop to the subsetting operator [. This is useful to get a
Column from a DataFrame, given its name. R supports it.

In R:

```
> name <- "Sepal_Length"
> class(iris[, name])
[1] "numeric"
```

Currently, in SparkR:

```
> name <- "Sepal_Length"
> class(irisDF[, name])
[1] "DataFrame"
```

The previous code returns a DataFrame, which is inconsistent with R's
behavior; SparkR should return a Column instead. Currently, the only way for
the user to get a Column from a column name held in a character variable is
through `eval(parse(text = x))`, where x is the string `"irisDF$Sepal_Length"`.
That itself is pretty hacky. `SparkR:::getColumn()` is another option, but I
don't see why this method should be externalized. Instead, following R's way
of doing things, the proposed implementation allows this:

```
> name <- "Sepal_Length"
> class(irisDF[, name, drop=T])
[1] "Column"

> class(irisDF[, name, drop=F])
[1] "DataFrame"
```

This is consistent with R:

```
> name <- "Sepal_Length"
> class(iris[, name])
[1] "numeric"

> class(iris[, name, drop=F])
[1] "data.frame"
```

Author: Oscar D. Lara Yejas
Author: Oscar D. Lara Yejas

Closes #11318 from olarayej/SPARK-13436.
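For illustration only (an editor's sketch, not part of the original patch), the behavior described above can be exercised as follows. It assumes the SparkR session API of this period (`sparkR.init()` / `sparkRSQL.init()`) and an `irisDF` created from R's `iris` dataset, whose column names get underscores in place of dots (e.g. `Sepal_Length`):

```
library(SparkR)

# Session setup for the SparkR API of this period; later releases use sparkR.session()
sc <- sparkR.init(master = "local[*]")
sqlContext <- sparkRSQL.init(sc)

# createDataFrame() renames iris columns, e.g. Sepal.Length -> Sepal_Length
irisDF <- createDataFrame(sqlContext, iris)

name <- "Sepal_Length"

# The workaround described above: build "irisDF$Sepal_Length" as text and evaluate it
col_old <- eval(parse(text = paste0("irisDF$", name)))
class(col_old)                        # "Column", but hacky

# With this patch, drop decides whether a single selected column is unwrapped
class(irisDF[, name, drop = TRUE])    # "Column"
class(irisDF[, name, drop = FALSE])   # "SparkDataFrame"
class(irisDF[, name])                 # "SparkDataFrame" (default drop = F)

# subset() forwards drop to the extract operator, so filter + project + unwrap works in one call
class(subset(irisDF, irisDF$Sepal_Width > 3, name, drop = TRUE))  # "Column"

sparkR.stop()
```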
---
 R/pkg/R/DataFrame.R                       | 70 ++++++++++++++++---------
 R/pkg/R/utils.R                           |  2 +-
 R/pkg/inst/tests/testthat/test_sparkSQL.R | 24 +++++++----
 3 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 36aedfae86..48ac1b06f6 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1237,29 +1237,38 @@ setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
 
 #' @rdname subset
 #' @name [
-setMethod("[", signature(x = "SparkDataFrame", i = "missing"),
-          function(x, i, j, ...) {
-            if (is.numeric(j)) {
-              cols <- columns(x)
-              j <- cols[j]
-            }
-            if (length(j) > 1) {
-              j <- as.list(j)
+setMethod("[", signature(x = "SparkDataFrame"),
+          function(x, i, j, ..., drop = F) {
+            # Perform filtering first if needed
+            filtered <- if (missing(i)) {
+              x
+            } else {
+              if (class(i) != "Column") {
+                stop(paste0("Expressions other than filtering predicates are not supported ",
+                            "in the first parameter of extract operator [ or subset() method."))
+              }
+              filter(x, i)
             }
-            select(x, j)
-          })
 
-#' @rdname subset
-#' @name [
-setMethod("[", signature(x = "SparkDataFrame", i = "Column"),
-          function(x, i, j, ...) {
-            # It could handle i as "character" but it seems confusing and not required
-            # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
-            filtered <- filter(x, i)
-            if (!missing(j)) {
-              filtered[, j, ...]
-            } else {
+            # If something is to be projected, then do so on the filtered SparkDataFrame
+            if (missing(j)) {
               filtered
+            } else {
+              if (is.numeric(j)) {
+                cols <- columns(filtered)
+                j <- cols[j]
+              }
+              if (length(j) > 1) {
+                j <- as.list(j)
+              }
+              selected <- select(filtered, j)
+
+              # Acknowledge parameter drop. Return a Column or SparkDataFrame accordingly
+              if (ncol(selected) == 1 & drop == T) {
+                getColumn(selected, names(selected))
+              } else {
+                selected
+              }
             }
           })
 
@@ -1268,10 +1277,10 @@ setMethod("[", signature(x = "SparkDataFrame", i = "Column"),
 #' Return subsets of SparkDataFrame according to given conditions
 #' @param x A SparkDataFrame
 #' @param subset (Optional) A logical expression to filter on rows
-#' @param select expression for the single Column or a list of columns to select from the
-#'               SparkDataFrame
-#' @return A new SparkDataFrame containing only the rows that meet the condition with selected
-#'         columns
+#' @param select expression for the single Column or a list of columns to select from the SparkDataFrame
+#' @param drop if TRUE, a Column will be returned if the resulting dataset has only one column.
+#'             Otherwise, a SparkDataFrame will always be returned.
+#' @return A new SparkDataFrame containing only the rows that meet the condition with selected columns
 #' @export
 #' @family SparkDataFrame functions
 #' @rdname subset
@@ -1293,12 +1302,8 @@ setMethod("[", signature(x = "SparkDataFrame", i = "Column"),
 #'   subset(df, select = c(1,2))
 #' }
 setMethod("subset", signature(x = "SparkDataFrame"),
-          function(x, subset, select, ...) {
-            if (missing(subset)) {
-              x[, select, ...]
-            } else {
-              x[subset, select, ...]
-            }
+          function(x, subset, select, drop = F, ...) {
+            x[subset, select, drop = drop]
           })
 
 #' Select
@@ -2520,7 +2525,7 @@ setMethod("histogram",
       }
 
       # Filter NA values in the target column and remove all other columns
-      df <- na.omit(df[, colname])
+      df <- na.omit(df[, colname, drop = F])
 
       getColumn(df, colname)
     } else if (class(col) == "Column") {
@@ -2552,8 +2557,7 @@ setMethod("histogram",
         col
       }
 
-      # At this point, df only has one column: the one to compute the histogram from
-      stats <- collect(describe(df[, colname]))
+      stats <- collect(describe(df[, colname, drop = F]))
       min <- as.numeric(stats[4, 2])
       max <- as.numeric(stats[5, 2])
 
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index ba7a611e64..bf67e231d5 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -632,7 +632,7 @@ assignNewEnv <- function(data) {
 
   env <- new.env()
   for (i in 1:length(cols)) {
-    assign(x = cols[i], value = data[, cols[i]], envir = env)
+    assign(x = cols[i], value = data[, cols[i], drop = F], envir = env)
   }
   env
 }
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 336068035e..95d6cb8875 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -822,9 +822,10 @@ test_that("select operators", {
   expect_is(df[[2]], "Column")
   expect_is(df[["age"]], "Column")
 
-  expect_is(df[, 1], "SparkDataFrame")
-  expect_equal(columns(df[, 1]), c("name"))
-  expect_equal(columns(df[, "age"]), c("age"))
+  expect_is(df[, 1, drop = F], "SparkDataFrame")
+  expect_equal(columns(df[, 1, drop = F]), c("name"))
+  expect_equal(columns(df[, "age", drop = F]), c("age"))
+
   df2 <- df[, c("age", "name")]
   expect_is(df2, "SparkDataFrame")
   expect_equal(columns(df2), c("age", "name"))
@@ -835,6 +836,13 @@ test_that("select operators", {
   df$age2 <- df$age * 2
   expect_equal(columns(df), c("name", "age", "age2"))
   expect_equal(count(where(df, df$age2 == df$age * 2)), 2)
+
+  # Test parameter drop
+  expect_equal(class(df[, 1]) == "SparkDataFrame", T)
+  expect_equal(class(df[, 1, drop = T]) == "Column", T)
+  expect_equal(class(df[, 1, drop = F]) == "SparkDataFrame", T)
+  expect_equal(class(df[df$age > 4, 2, drop = T]) == "Column", T)
+  expect_equal(class(df[df$age > 4, 2, drop = F]) == "SparkDataFrame", T)
 })
 
 test_that("select with column", {
@@ -889,13 +897,13 @@ test_that("subsetting", {
   expect_equal(columns(filtered), c("name", "age"))
   expect_equal(collect(filtered)$name, "Andy")
 
-  df2 <- df[df$age == 19, 1]
+  df2 <- df[df$age == 19, 1, drop = F]
   expect_is(df2, "SparkDataFrame")
   expect_equal(count(df2), 1)
   expect_equal(columns(df2), c("name"))
   expect_equal(collect(df2)$name, "Justin")
 
-  df3 <- df[df$age > 20, 2]
+  df3 <- df[df$age > 20, 2, drop = F]
   expect_equal(count(df3), 1)
   expect_equal(columns(df3), c("age"))
 
@@ -911,7 +919,7 @@ test_that("subsetting", {
   expect_equal(count(df6), 1)
   expect_equal(columns(df6), c("name", "age"))
 
-  df7 <- subset(df, select = "name")
+  df7 <- subset(df, select = "name", drop = F)
   expect_equal(count(df7), 3)
   expect_equal(columns(df7), c("name"))
 
@@ -1888,7 +1896,7 @@ test_that("attach() on a DataFrame", {
   stat2 <- summary(age)
   expect_equal(collect(stat2)[5, "age"], "30")
   detach("df")
-  stat3 <- summary(df[, "age"])
+  stat3 <- summary(df[, "age", drop = F])
   expect_equal(collect(stat3)[5, "age"], "30")
   expect_error(age)
 })
@@ -1928,7 +1936,7 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", {
   df1 <- select(df, cast(df$age, "integer"))
   coltypes(df) <- c("character", "integer")
   expect_equal(dtypes(df), list(c("name", "string"), c("age", "int")))
-  value <- collect(df[, 2])[[3, 1]]
+  value <- collect(df[, 2, drop = F])[[3, 1]]
   expect_equal(value, collect(df1)[[3, 1]])
   expect_equal(value, 22)
 
-- 
cgit v1.2.3
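For reference (this note and snippet are an illustration added here, not part of the patch): base R's `[` defaults to `drop = TRUE` for data.frames, whereas the SparkR method above defaults to `drop = F`, so existing code that expects `df[, j]` to stay a SparkDataFrame keeps working and Column extraction is opt-in. The base R behavior the commit message compares against can be checked without Spark:

```
# Plain base R, no Spark required. For data.frames, drop defaults to TRUE,
# so selecting a single column with [ yields a vector unless drop = FALSE.
class(iris[, "Sepal.Length"])                # "numeric"
class(iris[, "Sepal.Length", drop = FALSE])  # "data.frame"
```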