[SPARK-12337][SPARKR] Implement dropDuplicates() method of DataFrame in SparkR.

Author: Sun Rui <rui.sun@intel.com>

Closes #10309 from sun-rui/SPARK-12337.
author: Sun Rui <rui.sun@intel.com> 2016-01-19 16:37:18 -0800
committer: Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2016-01-19 16:37:18 -0800
commit: 3ac648289c543b56937d67b5df5c3e228ef47cbd (patch)
tree: 74c38841840ce85970ce4883eec9a2c0cffbc546 /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent: 37fefa66cbd61bc592aba42b0ed3aefc0cf3abb0 (diff)
download: spark-3ac648289c543b56937d67b5df5c3e228ef47cbd.tar.gz
spark-3ac648289c543b56937d67b5df5c3e228ef47cbd.tar.bz2
spark-3ac648289c543b56937d67b5df5c3e228ef47cbd.zip
1 file changed, 37 insertions, 1 deletion
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 67ecdbc522..6610734cf4 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -734,7 +734,7 @@ test_that("head() and first() return the correct data", {
   expect_equal(ncol(testFirst), 2)
 })
 
-test_that("distinct() and unique on DataFrames", {
+test_that("distinct(), unique() and dropDuplicates() on DataFrames", {
   lines <- c("{\"name\":\"Michael\"}",
              "{\"name\":\"Andy\", \"age\":30}",
              "{\"name\":\"Justin\", \"age\":19}",
@@ -750,6 +750,42 @@ test_that("distinct() and unique on DataFrames", {
   uniques2 <- unique(df)
   expect_is(uniques2, "DataFrame")
   expect_equal(count(uniques2), 3)
+
+  # Test dropDuplicates()
+  df <- createDataFrame(
+    sqlContext,
+    list(
+      list(2, 1, 2), list(1, 1, 1),
+      list(1, 2, 1), list(2, 1, 2),
+      list(2, 2, 2), list(2, 2, 1),
+      list(2, 1, 1), list(1, 1, 2),
+      list(1, 2, 2), list(1, 2, 1)),
+    schema = c("key", "value1", "value2"))
+  result <- collect(dropDuplicates(df))
+  expected <- rbind.data.frame(
+    c(1, 1, 1), c(1, 1, 2), c(1, 2, 1),
+    c(1, 2, 2), c(2, 1, 1), c(2, 1, 2),
+    c(2, 2, 1), c(2, 2, 2))
+  names(expected) <- c("key", "value1", "value2")
+  expect_equivalent(
+    result[order(result$key, result$value1, result$value2),],
+    expected)
+
+  result <- collect(dropDuplicates(df, c("key", "value1")))
+  expected <- rbind.data.frame(
+    c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
+  names(expected) <- c("key", "value1", "value2")
+  expect_equivalent(
+    result[order(result$key, result$value1, result$value2),],
+    expected)
+
+  result <- collect(dropDuplicates(df, "key"))
+  expected <- rbind.data.frame(
+    c(1, 1, 1), c(2, 1, 2))
+  names(expected) <- c("key", "value1", "value2")
+  expect_equivalent(
+    result[order(result$key, result$value1, result$value2),],
+    expected)
 })
 
 test_that("sample on a DataFrame", {
author	Sun Rui <rui.sun@intel.com>	2016-01-19 16:37:18 -0800
committer	Shivaram Venkataraman <shivaram@cs.berkeley.edu>	2016-01-19 16:37:18 -0800
commit	3ac648289c543b56937d67b5df5c3e228ef47cbd (patch)
tree	74c38841840ce85970ce4883eec9a2c0cffbc546 /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent	37fefa66cbd61bc592aba42b0ed3aefc0cf3abb0 (diff)
download	spark-3ac648289c543b56937d67b5df5c3e228ef47cbd.tar.gz spark-3ac648289c543b56937d67b5df5c3e228ef47cbd.tar.bz2 spark-3ac648289c543b56937d67b5df5c3e228ef47cbd.zip