[SPARK-19818][SPARKR] rbind should check for name consistency of input data frames

## What changes were proposed in this pull request?

Added checks for name consistency of input data frames in union.

## How was this patch tested?

New test.

Author: actuaryzhang <actuaryzhang10@gmail.com>

Closes #17159 from actuaryzhang/sparkRUnion.
author: actuaryzhang <actuaryzhang10@gmail.com> 2017-03-06 21:55:11 -0800
committer: Felix Cheung <felixcheung@apache.org> 2017-03-06 21:55:11 -0800
commit: 1f6c090c15f355a0c2aad736f8291fcdee5c556d (patch)
tree: 8e3efe42e7e5fbb3011b9f50c489d43012e1b90f /R
parent: 9909f6d361fdf2b7ef30fa7fbbc91e00f2999794 (diff)
download: spark-1f6c090c15f355a0c2aad736f8291fcdee5c556d.tar.gz
spark-1f6c090c15f355a0c2aad736f8291fcdee5c556d.tar.bz2
spark-1f6c090c15f355a0c2aad736f8291fcdee5c556d.zip
2 files changed, 14 insertions, 1 deletion
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index e33d0d8e29..97e0c9edea 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
 #'
 #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
 #' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
+#' Input SparkDataFrames can have different schemas (names and data types).
 #'
 #' Note: This does not remove duplicate rows across the two SparkDataFrames.
 #'
@@ -2685,7 +2686,8 @@ setMethod("unionAll",
 
 #' Union two or more SparkDataFrames
 #'
-#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
+#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
+#' requires that the input SparkDataFrames have the same column names.
 #'
 #' Note: This does not remove duplicate rows across the two SparkDataFrames.
 #'
@@ -2709,6 +2711,10 @@ setMethod("unionAll",
 setMethod("rbind",
           signature(... = "SparkDataFrame"),
           function(x, ..., deparse.level = 1) {
+            nm <- lapply(list(x, ...), names)
+            if (length(unique(nm)) != 1) {
+              stop("Names of input data frames are different.")
+            }
             if (nargs() == 3) {
               union(x, ...)
             } else {
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 7c096597fe..620b633637 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1850,6 +1850,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
   expect_equal(count(unioned2), 12)
   expect_equal(first(unioned2)$name, "Michael")
 
+  df3 <- df2
+  names(df3)[1] <- "newName"
+  expect_error(rbind(df, df3),
+               "Names of input data frames are different.")
+  expect_error(rbind(df, df2, df3),
+               "Names of input data frames are different.")
+
   excepted <- arrange(except(df, df2), desc(df$age))
   expect_is(unioned, "SparkDataFrame")
   expect_equal(count(excepted), 2)
author	actuaryzhang <actuaryzhang10@gmail.com>	2017-03-06 21:55:11 -0800
committer	Felix Cheung <felixcheung@apache.org>	2017-03-06 21:55:11 -0800
commit	1f6c090c15f355a0c2aad736f8291fcdee5c556d (patch)
tree	8e3efe42e7e5fbb3011b9f50c489d43012e1b90f /R
parent	9909f6d361fdf2b7ef30fa7fbbc91e00f2999794 (diff)
download	spark-1f6c090c15f355a0c2aad736f8291fcdee5c556d.tar.gz spark-1f6c090c15f355a0c2aad736f8291fcdee5c556d.tar.bz2 spark-1f6c090c15f355a0c2aad736f8291fcdee5c556d.zip