aboutsummaryrefslogtreecommitdiff
path: root/R/pkg/inst/tests/testthat/test_sparkSQL.R
diff options
context:
space:
mode:
authorNarine Kokhlikyan <narine@slice.com>2016-07-01 13:55:13 -0700
committerShivaram Venkataraman <shivaram@cs.berkeley.edu>2016-07-01 13:55:13 -0700
commit26afb4ce4099e7942f8db1ead3817ed8fbf71ce3 (patch)
treea43b0b4dfa9278f8d4f5492b40cfde3c6922c16b /R/pkg/inst/tests/testthat/test_sparkSQL.R
parentc55397652ad1c6d047a8b8eb7fd92a8a1dc66306 (diff)
downloadspark-26afb4ce4099e7942f8db1ead3817ed8fbf71ce3.tar.gz
spark-26afb4ce4099e7942f8db1ead3817ed8fbf71ce3.tar.bz2
spark-26afb4ce4099e7942f8db1ead3817ed8fbf71ce3.zip
[SPARK-16012][SPARKR] Implement gapplyCollect which will apply a R function on each group similar to gapply and collect the result back to R data.frame
## What changes were proposed in this pull request? gapplyCollect() does gapply() on a SparkDataFrame and collect the result back to R. Compared to gapply() + collect(), gapplyCollect() offers performance optimization as well as programming convenience, as no schema is needed to be provided. This is similar to dapplyCollect(). ## How was this patch tested? Added test cases for gapplyCollect similar to dapplyCollect Author: Narine Kokhlikyan <narine@slice.com> Closes #13760 from NarineK/gapplyCollect.
Diffstat (limited to 'R/pkg/inst/tests/testthat/test_sparkSQL.R')
-rw-r--r--R/pkg/inst/tests/testthat/test_sparkSQL.R35
1 files changed, 30 insertions, 5 deletions
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 588c217f3c..3b8d5707f4 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -2257,21 +2257,24 @@ test_that("repartition by columns on DataFrame", {
expect_equal(nrow(df1), 2)
})
-test_that("gapply() on a DataFrame", {
+test_that("gapply() and gapplyCollect() on a DataFrame", {
df <- createDataFrame (
list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
c("a", "b", "c", "d"))
expected <- collect(df)
- df1 <- gapply(df, list("a"), function(key, x) { x }, schema(df))
+ df1 <- gapply(df, "a", function(key, x) { x }, schema(df))
actual <- collect(df1)
expect_identical(actual, expected)
+ df1Collect <- gapplyCollect(df, list("a"), function(key, x) { x })
+ expect_identical(df1Collect, expected)
+
# Computes the sum of second column by grouping on the first and third columns
# and checks if the sum is larger than 2
schema <- structType(structField("a", "integer"), structField("e", "boolean"))
df2 <- gapply(
df,
- list(df$"a", df$"c"),
+ c(df$"a", df$"c"),
function(key, x) {
y <- data.frame(key[1], sum(x$b) > 2)
},
@@ -2280,13 +2283,24 @@ test_that("gapply() on a DataFrame", {
expected <- c(TRUE, TRUE)
expect_identical(actual, expected)
+ df2Collect <- gapplyCollect(
+ df,
+ c(df$"a", df$"c"),
+ function(key, x) {
+ y <- data.frame(key[1], sum(x$b) > 2)
+ colnames(y) <- c("a", "e")
+ y
+ })
+ actual <- df2Collect$e
+ expect_identical(actual, expected)
+
# Computes the arithmetic mean of the second column by grouping
# on the first and third columns. Output the groupping value and the average.
schema <- structType(structField("a", "integer"), structField("c", "string"),
structField("avg", "double"))
df3 <- gapply(
df,
- list("a", "c"),
+ c("a", "c"),
function(key, x) {
y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
},
@@ -2301,11 +2315,22 @@ test_that("gapply() on a DataFrame", {
rownames(expected) <- NULL
expect_identical(actual, expected)
+ df3Collect <- gapplyCollect(
+ df,
+ c("a", "c"),
+ function(key, x) {
+ y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
+ colnames(y) <- c("a", "c", "avg")
+ y
+ })
+ actual <- df3Collect[order(df3Collect$a), ]
+ expect_identical(actual$avg, expected$avg)
+
irisDF <- suppressWarnings(createDataFrame (iris))
schema <- structType(structField("Sepal_Length", "double"), structField("Avg", "double"))
# Groups by `Sepal_Length` and computes the average for `Sepal_Width`
df4 <- gapply(
- cols = list("Sepal_Length"),
+ cols = "Sepal_Length",
irisDF,
function(key, x) {
y <- data.frame(key, mean(x$Sepal_Width), stringsAsFactors = FALSE)