about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- R/pkg/R/DataFrame.R 25
-rw-r--r-- R/pkg/R/generics.R 7
-rw-r--r-- R/pkg/inst/tests/testthat/test_sparkSQL.R 8
3 files changed, 29 insertions, 11 deletions
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index d72cbbd79e..c710bffa2c 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1936,10 +1936,11 @@ setMethod("where",
#' the subset of columns.
#'
#' @param x A SparkDataFrame.
-#' @param colnames A character vector of column names.
+#' @param ... Column names, given either as a single character vector or as separate strings.
+#' If the first argument is a character vector with more than one element, any following arguments are ignored.
#' @return A SparkDataFrame with duplicate rows removed.
#' @family SparkDataFrame functions
-#' @rdname dropduplicates
+#' @rdname dropDuplicates
#' @name dropDuplicates
#' @export
#' @examples
@@ -1949,14 +1950,26 @@ setMethod("where",
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' dropDuplicates(df)
+#' dropDuplicates(df, "col1", "col2")
#' dropDuplicates(df, c("col1", "col2"))
#' }
setMethod("dropDuplicates",
signature(x = "SparkDataFrame"),
- function(x, colNames = columns(x)) {
- stopifnot(class(colNames) == "character")
-
- sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(colNames))
+ function(x, ...) {
+ cols <- list(...)
+ if (length(cols) == 0) {
+ sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(columns(x)))
+ } else {
+ if (!all(sapply(cols, function(c) { is.character(c) }))) {
+ stop("all columns names should be characters")
+ }
+ col <- cols[[1]]
+ if (length(col) > 1) {
+ sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(col))
+ } else {
+ sdf <- callJMethod(x@sdf, "dropDuplicates", cols)
+ }
+ }
dataFrame(sdf)
})
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 40a96d8991..8164e7731a 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -466,12 +466,9 @@ setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
#' @export
setGeneric("drop", function(x, ...) { standardGeneric("drop") })
-#' @rdname dropduplicates
+#' @rdname dropDuplicates
#' @export
-setGeneric("dropDuplicates",
- function(x, colNames = columns(x)) {
- standardGeneric("dropDuplicates")
- })
+setGeneric("dropDuplicates", function(x, ...) { standardGeneric("dropDuplicates") })
#' @rdname nafunctions
#' @export
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index c11930ada6..11d69366df 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -796,6 +796,14 @@ test_that("distinct(), unique() and dropDuplicates() on DataFrames", {
result[order(result$key, result$value1, result$value2), ],
expected)
+ result <- collect(dropDuplicates(df, "key", "value1"))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+
result <- collect(dropDuplicates(df, "key"))
expected <- rbind.data.frame(
c(1, 1, 1), c(2, 1, 2))