author     Felix Cheung <felixcheung_m@hotmail.com>         2016-06-21 13:36:50 -0700
committer  Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2016-06-21 13:36:50 -0700
commit     dbfdae4e41a900de01b48639d6554d32edbb2e0b (patch)
tree       e91108581f7ce883c094deafe26bb1bd3cf7f660 /R
parent     918c91954fb46400ce2c5ab066d2ec0ae48dda4a (diff)
[SPARK-16096][SPARKR] add union and deprecate unionAll
## What changes were proposed in this pull request?

Add `union` and deprecate `unionAll`; separate the roxygen2 doc for `rbind` (since their usage and parameter lists are quite different).

`explode` is also deprecated - but it seems the replacement is a combination of calls; not sure if we should deprecate it in SparkR yet.

## How was this patch tested?

Unit tests, manual checks for R doc.

Author: Felix Cheung <felixcheung_m@hotmail.com>

Closes #13805 from felixcheung/runion.
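For quick reference, a minimal usage sketch of the API after this patch. It assumes a local SparkR session and reuses the placeholder JSON paths from the roxygen examples in the diff below; it is illustrative only, not part of the commit.

```r
library(SparkR)
sparkR.session()

df1 <- read.json(path)    # `path`/`path2` are placeholders, as in the roxygen examples
df2 <- read.json(path2)

unioned <- union(df1, df2)       # new name; equivalent to SQL UNION ALL (duplicates kept)
legacy  <- unionAll(df1, df2)    # still available, but now emits a deprecation warning
stacked <- rbind(df1, df2, df1)  # rbind unions two or more SparkDataFrames
```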
Diffstat (limited to 'R')
-rw-r--r--  R/pkg/NAMESPACE                              1
-rw-r--r--  R/pkg/R/DataFrame.R                         43
-rw-r--r--  R/pkg/R/generics.R                           6
-rw-r--r--  R/pkg/inst/tests/testthat/test_context.R     2
-rw-r--r--  R/pkg/inst/tests/testthat/test_sparkSQL.R    8
5 files changed, 47 insertions, 13 deletions
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index ea42888eae..2272d8bdd5 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -107,6 +107,7 @@ exportMethods("arrange",
"summary",
"take",
"transform",
+ "union",
"unionAll",
"unique",
"unpersist",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index ed0bb85f43..725cbf24f2 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2251,7 +2251,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
cols
}
-#' rbind
+#' Return a new SparkDataFrame containing the union of rows
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
#' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL.
@@ -2261,39 +2261,64 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
#' @param y A SparkDataFrame
#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
-#' @rdname rbind
-#' @name unionAll
+#' @rdname union
+#' @name union
+#' @seealso \link{rbind}
#' @export
#' @examples
#'\dontrun{
#' sparkR.session()
#' df1 <- read.json(path)
#' df2 <- read.json(path2)
-#' unioned <- unionAll(df, df2)
+#' unioned <- union(df, df2)
+#' unions <- rbind(df, df2, df3, df4)
#' }
+#' @note union since 2.0.0
+setMethod("union",
+ signature(x = "SparkDataFrame", y = "SparkDataFrame"),
+ function(x, y) {
+ unioned <- callJMethod(x@sdf, "union", y@sdf)
+ dataFrame(unioned)
+ })
+
+#' unionAll is deprecated - use union instead
+#' @rdname union
+#' @name unionAll
+#' @export
#' @note unionAll since 1.4.0
setMethod("unionAll",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
- unioned <- callJMethod(x@sdf, "unionAll", y@sdf)
- dataFrame(unioned)
+ .Deprecated("union")
+ union(x, y)
})
#' Union two or more SparkDataFrames
#'
-#' Returns a new SparkDataFrame containing rows of all parameters.
+#' Union two or more SparkDataFrames. This is equivalent to `UNION ALL` in SQL.
+#' Note that this does not remove duplicate rows across the two SparkDataFrames.
#'
+#' @param x A SparkDataFrame
+#' @param ... Additional SparkDataFrame
+#' @return A SparkDataFrame containing the result of the union.
+#' @family SparkDataFrame functions
#' @rdname rbind
#' @name rbind
+#' @seealso \link{union}
#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' unions <- rbind(df, df2, df3, df4)
+#' }
#' @note rbind since 1.5.0
setMethod("rbind",
signature(... = "SparkDataFrame"),
function(x, ..., deparse.level = 1) {
if (nargs() == 3) {
- unionAll(x, ...)
+ union(x, ...)
} else {
- unionAll(x, Recall(..., deparse.level = 1))
+ union(x, Recall(..., deparse.level = 1))
}
})
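The rbind method above reduces a variadic call to nested pairwise unions via Recall. A rough sketch of the effective expansion for the four-frame example from the roxygen docs (illustrative only; df through df4 are placeholder SparkDataFrames):

```r
# rbind(df, df2, df3, df4) recurses right-to-left through Recall, so it behaves
# roughly like the right-nested chain of pairwise unions below.
via_rbind <- rbind(df, df2, df3, df4)
via_union <- union(df, union(df2, union(df3, df4)))
# Both keep duplicate rows, matching UNION ALL semantics.
```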
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 7b08a8ee66..27dfd67ffc 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -662,7 +662,11 @@ setGeneric("toJSON", function(x) { standardGeneric("toJSON") })
setGeneric("toRDD", function(x) { standardGeneric("toRDD") })
-#' @rdname rbind
+#' @rdname union
+#' @export
+setGeneric("union", function(x, y) { standardGeneric("union") })
+
+#' @rdname union
#' @export
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R
index b149818ff4..3d232df566 100644
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ b/R/pkg/inst/tests/testthat/test_context.R
@@ -24,7 +24,7 @@ test_that("Check masked functions", {
namesOfMaskedCompletely <- c("cov", "filter", "sample")
namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
"colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
- "summary", "transform", "drop", "window", "as.data.frame")
+ "summary", "transform", "drop", "window", "as.data.frame", "union")
if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
}
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 7c192fb5a0..9378c7afac 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1590,7 +1590,7 @@ test_that("isLocal()", {
expect_false(isLocal(df))
})
-test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
+test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
df <- read.json(jsonPath)
lines <- c("{\"name\":\"Bob\", \"age\":24}",
@@ -1600,10 +1600,11 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
writeLines(lines, jsonPath2)
df2 <- read.df(jsonPath2, "json")
- unioned <- arrange(unionAll(df, df2), df$age)
+ unioned <- arrange(union(df, df2), df$age)
expect_is(unioned, "SparkDataFrame")
expect_equal(count(unioned), 6)
expect_equal(first(unioned)$name, "Michael")
+ expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)
unioned2 <- arrange(rbind(unioned, df, df2), df$age)
expect_is(unioned2, "SparkDataFrame")
@@ -1620,6 +1621,9 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
expect_equal(count(intersected), 1)
expect_equal(first(intersected)$name, "Andy")
+ # Test base::union is working
+ expect_equal(union(c(1:3), c(3:5)), c(1:5))
+
# Test base::rbind is working
expect_equal(length(rbind(1:4, c = 2, a = 10, 10, deparse.level = 0)), 16)
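One check the new tests do not make explicit is that unionAll actually warns. A possible testthat addition, assuming the same jsonPath/jsonPath2 fixtures used above (hypothetical, not part of this patch):

```r
# Hypothetical follow-up test: unionAll should raise a deprecation warning
# that points callers at union.
test_that("unionAll() is deprecated in favor of union()", {
  df <- read.json(jsonPath)
  df2 <- read.df(jsonPath2, "json")
  expect_warning(unionAll(df, df2), "deprecated")
})
```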