diff options
author | NarineK <narine.kokhlikyan@us.ibm.com> | 2016-05-05 12:00:55 -0700 |
---|---|---|
committer | Davies Liu <davies.liu@gmail.com> | 2016-05-05 12:00:55 -0700 |
commit | 22226fcc926f9d3b8aa7b47dcd9847021e6a6879 (patch) | |
tree | 5d3087226563265109c82c9130ff5de6b5eac8b2 /R/pkg/inst/tests/testthat/test_sparkSQL.R | |
parent | ac12b35d31ef1d1663511bf6ae826a9cc0278f20 (diff) | |
download | spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.tar.gz spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.tar.bz2 spark-22226fcc926f9d3b8aa7b47dcd9847021e6a6879.zip |
[SPARK-15110] [SPARKR] Implement repartitionByColumn for SparkR DataFrames
## What changes were proposed in this pull request?
Implement repartitionByColumn on DataFrame.
This will allow us to run R functions on each partition identified by column groups with dapply() method.
## How was this patch tested?
Unit tests
Author: NarineK <narine.kokhlikyan@us.ibm.com>
Closes #12887 from NarineK/repartitionByColumns.
Diffstat (limited to 'R/pkg/inst/tests/testthat/test_sparkSQL.R')
-rw-r--r-- | R/pkg/inst/tests/testthat/test_sparkSQL.R | 36 |
1 files changed, 36 insertions, 0 deletions
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 081f7b1663..3b6a27c3b8 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2082,6 +2082,42 @@ test_that("dapply() on a DataFrame", { expect_identical(expected, result) }) +test_that("repartition by columns on DataFrame", { + df <- createDataFrame ( + sqlContext, + list(list(1L, 1, "1", 0.1), list(1L, 2, "2", 0.2), list(3L, 3, "3", 0.3)), + c("a", "b", "c", "d")) + + # no column and number of partitions specified + retError <- tryCatch(repartition(df), error = function(e) e) + expect_equal(grepl + ("Please, specify the number of partitions and/or a column\\(s\\)", retError), TRUE) + + # repartition by column and number of partitions + actual <- repartition(df, 3L, col = df$"a") + + # since we cannot access the number of partitions from dataframe, checking + # that at least the dimensions are identical + expect_identical(dim(df), dim(actual)) + + # repartition by number of partitions + actual <- repartition(df, 13L) + expect_identical(dim(df), dim(actual)) + + # a test case with a column and dapply + schema <- structType(structField("a", "integer"), structField("avg", "double")) + df <- repartition(df, col = df$"a") + df1 <- dapply( + df, + function(x) { + y <- (data.frame(x$a[1], mean(x$b))) + }, + schema) + + # Number of partitions is equal to 2 + expect_equal(nrow(df1), 2) +}) + unlink(parquetPath) unlink(jsonPath) unlink(jsonPathNa) |