[SPARK-15110] [SPARKR] Implement repartitionByColumn for SparkR DataFrames

## What changes were proposed in this pull request? Implement repartitionByColumn on DataFrame. This will allow us to run R functions on each partition identified by column groups with dapply() method. ## How was this patch tested? Unit tests Author: NarineK <narine.kokhlikyan@us.ibm.com> Closes #12887 from NarineK/repartitionByColumns.
author: NarineK <narine.kokhlikyan@us.ibm.com> 2016-05-05 12:00:55 -0700
committer: Davies Liu <davies.liu@gmail.com> 2016-05-05 12:00:55 -0700
commit: 22226fcc926f9d3b8aa7b47dcd9847021e6a6879 (patch)
tree: 5d3087226563265109c82c9130ff5de6b5eac8b2 /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent: ac12b35d31ef1d1663511bf6ae826a9cc0278f20 (diff)
download: spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.tar.gz
spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.tar.bz2
spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.zip
1 files changed, 36 insertions, 0 deletions
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 081f7b1663..3b6a27c3b8 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -2082,6 +2082,42 @@ test_that("dapply() on a DataFrame", {
   expect_identical(expected, result)
 })
 
+test_that("repartition by columns on DataFrame", {
+  df <- createDataFrame (
+    sqlContext,
+    list(list(1L, 1, "1", 0.1), list(1L, 2, "2", 0.2), list(3L, 3, "3", 0.3)),
+    c("a", "b", "c", "d"))
+
+  # no column and number of partitions specified
+  retError <- tryCatch(repartition(df), error = function(e) e)
+  expect_equal(grepl
+    ("Please, specify the number of partitions and/or a column\\(s\\)", retError), TRUE)
+
+  # repartition by column and number of partitions
+  actual <- repartition(df, 3L, col = df$"a")
+
+  # since we cannot access the number of partitions from dataframe, checking
+  # that at least the dimensions are identical
+  expect_identical(dim(df), dim(actual))
+
+  # repartition by number of partitions
+  actual <- repartition(df, 13L)
+  expect_identical(dim(df), dim(actual))
+
+  # a test case with a column and dapply
+  schema <-  structType(structField("a", "integer"), structField("avg", "double"))
+  df <- repartition(df, col = df$"a")
+  df1 <- dapply(
+    df,
+    function(x) {
+      y <- (data.frame(x$a[1], mean(x$b)))
+    },
+    schema)
+
+  # Number of partitions is equal to 2
+  expect_equal(nrow(df1), 2)
+})
+
 unlink(parquetPath)
 unlink(jsonPath)
 unlink(jsonPathNa)
author	NarineK <narine.kokhlikyan@us.ibm.com>	2016-05-05 12:00:55 -0700
committer	Davies Liu <davies.liu@gmail.com>	2016-05-05 12:00:55 -0700
commit	22226fcc926f9d3b8aa7b47dcd9847021e6a6879 (patch)
tree	5d3087226563265109c82c9130ff5de6b5eac8b2 /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent	ac12b35d31ef1d1663511bf6ae826a9cc0278f20 (diff)
download	spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.tar.gz spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.tar.bz2 spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.zip