diff options
author | Dongjoon Hyun <dongjoon@apache.org> | 2016-06-16 20:35:17 -0700 |
---|---|---|
committer | Shivaram Venkataraman <shivaram@cs.berkeley.edu> | 2016-06-16 20:35:17 -0700 |
commit | 513a03e41e27d9c5f70911faccc5d3aecd8bdde9 (patch) | |
tree | 0b030b7d215d09fd12f6ef7274e91923915f2082 /R/pkg | |
parent | 5fd20b66ffe18c05cf257af7f30d32464d2fe8e7 (diff) | |
download | spark-513a03e41e27d9c5f70911faccc5d3aecd8bdde9.tar.gz spark-513a03e41e27d9c5f70911faccc5d3aecd8bdde9.tar.bz2 spark-513a03e41e27d9c5f70911faccc5d3aecd8bdde9.zip |
[SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR
## What changes were proposed in this pull request?
This PR adds a varargs-type `dropDuplicates` function to SparkR for API parity.
Refer to https://issues.apache.org/jira/browse/SPARK-15807, too.
## How was this patch tested?
Passes the Jenkins tests with new test cases.
Author: Dongjoon Hyun <dongjoon@apache.org>
Closes #13684 from dongjoon-hyun/SPARK-15908.
Diffstat (limited to 'R/pkg')
-rw-r--r-- | R/pkg/R/DataFrame.R | 25 | ||||
-rw-r--r-- | R/pkg/R/generics.R | 7 | ||||
-rw-r--r-- | R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 |
3 files changed, 29 insertions, 11 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index d72cbbd79e..c710bffa2c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1936,10 +1936,11 @@ setMethod("where", #' the subset of columns. #' #' @param x A SparkDataFrame. -#' @param colnames A character vector of column names. +#' @param ... A character vector of column names or string column names. +#' If the first argument contains a character vector, the followings are ignored. #' @return A SparkDataFrame with duplicate rows removed. #' @family SparkDataFrame functions -#' @rdname dropduplicates +#' @rdname dropDuplicates #' @name dropDuplicates #' @export #' @examples @@ -1949,14 +1950,26 @@ setMethod("where", #' path <- "path/to/file.json" #' df <- read.json(path) #' dropDuplicates(df) +#' dropDuplicates(df, "col1", "col2") #' dropDuplicates(df, c("col1", "col2")) #' } setMethod("dropDuplicates", signature(x = "SparkDataFrame"), - function(x, colNames = columns(x)) { - stopifnot(class(colNames) == "character") - - sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(colNames)) + function(x, ...) { + cols <- list(...) + if (length(cols) == 0) { + sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(columns(x))) + } else { + if (!all(sapply(cols, function(c) { is.character(c) }))) { + stop("all columns names should be characters") + } + col <- cols[[1]] + if (length(col) > 1) { + sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(col)) + } else { + sdf <- callJMethod(x@sdf, "dropDuplicates", cols) + } + } dataFrame(sdf) }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 40a96d8991..8164e7731a 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -466,12 +466,9 @@ setGeneric("describe", function(x, col, ...) { standardGeneric("describe") }) #' @export setGeneric("drop", function(x, ...) 
{ standardGeneric("drop") }) -#' @rdname dropduplicates +#' @rdname dropDuplicates #' @export -setGeneric("dropDuplicates", - function(x, colNames = columns(x)) { - standardGeneric("dropDuplicates") - }) +setGeneric("dropDuplicates", function(x, ...) { standardGeneric("dropDuplicates") }) #' @rdname nafunctions #' @export diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index c11930ada6..11d69366df 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -796,6 +796,14 @@ test_that("distinct(), unique() and dropDuplicates() on DataFrames", { result[order(result$key, result$value1, result$value2), ], expected) + result <- collect(dropDuplicates(df, "key", "value1")) + expected <- rbind.data.frame( + c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2)) + names(expected) <- c("key", "value1", "value2") + expect_equivalent( + result[order(result$key, result$value1, result$value2), ], + expected) + result <- collect(dropDuplicates(df, "key")) expected <- rbind.data.frame( c(1, 1, 1), c(2, 1, 2)) |