author     felixcheung <felixcheung_m@hotmail.com>  2016-04-23 00:20:27 -0700
committer  Reynold Xin <rxin@databricks.com>  2016-04-23 00:20:27 -0700
commit     a55fbe2a16aa0866ff8aca25bf9f772e6eb516a1 (patch)
tree       3c29aa4d17cad1c88f6eb989cae5a207077de689 /R/pkg/inst/tests/testthat/test_sparkSQL.R
parent     86ca8fefc8a147b31952b8a00e58e46d93bb8bc4 (diff)
[SPARK-12148][SPARKR] SparkR: rename DataFrame to SparkDataFrame
## What changes were proposed in this pull request?
Changed the class name defined in R from "DataFrame" to "SparkDataFrame". A popular package, S4Vector, already defines a "DataFrame" class; this change avoids the conflict.
Aside from the class name and API/roxygen2 references, SparkR APIs such as `createDataFrame` and `as.DataFrame` are not changed (S4Vector does not define an "as.DataFrame").
Since R code rarely references a type/class directly, this change should have minimal to no impact on SparkR users in terms of backward compatibility.
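As a concrete illustration, here is a minimal sketch (not part of the patch) of the user-visible effect. It assumes a SparkR 1.6-era session where `sc` and `sqlContext` have already been created via `sparkR.init()`/`sparkRSQL.init(sc)`, and uses R's built-in `faithful` dataset:

```r
# Entry points keep their names; only the S4 class they return is renamed.
df <- createDataFrame(sqlContext, faithful)  # API name unchanged
df2 <- as.DataFrame(sqlContext, faithful)    # API name unchanged

class(df)                 # "SparkDataFrame" after this change (was "DataFrame")
is(df, "SparkDataFrame")  # TRUE -- this is what the updated tests below assert
```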
## How was this patch tested?
SparkR unit tests, plus manually loading the S4Vector package followed by the SparkR package.
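A rough sketch of that manual check (an illustrative session, assuming the Bioconductor package is installed; its actual package name is S4Vectors):

```r
# Load S4Vectors first so its S4 class "DataFrame" is registered, then
# load SparkR, which now registers "SparkDataFrame" rather than a second,
# conflicting "DataFrame" class.
library(S4Vectors)  # defines an S4 class named "DataFrame"
library(SparkR)

sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)

sdf <- createDataFrame(sqlContext, faithful)
class(sdf)                     # "SparkDataFrame"
getClass("DataFrame")@package  # still "S4Vectors" -- no clash
```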
Author: felixcheung <felixcheung_m@hotmail.com>
Closes #12621 from felixcheung/rdataframe.
Diffstat (limited to 'R/pkg/inst/tests/testthat/test_sparkSQL.R')
-rw-r--r--  R/pkg/inst/tests/testthat/test_sparkSQL.R | 102
1 file changed, 51 insertions(+), 51 deletions(-)
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index b923ccf6bb..9bd3975405 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -101,8 +101,8 @@ test_that("create DataFrame from RDD", {
   rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
   df <- createDataFrame(sqlContext, rdd, list("a", "b"))
   dfAsDF <- as.DataFrame(sqlContext, rdd, list("a", "b"))
-  expect_is(df, "DataFrame")
-  expect_is(dfAsDF, "DataFrame")
+  expect_is(df, "SparkDataFrame")
+  expect_is(dfAsDF, "SparkDataFrame")
   expect_equal(count(df), 10)
   expect_equal(count(dfAsDF), 10)
   expect_equal(nrow(df), 10)
@@ -118,21 +118,21 @@ test_that("create DataFrame from RDD", {

   df <- createDataFrame(sqlContext, rdd)
   dfAsDF <- as.DataFrame(sqlContext, rdd)
-  expect_is(df, "DataFrame")
-  expect_is(dfAsDF, "DataFrame")
+  expect_is(df, "SparkDataFrame")
+  expect_is(dfAsDF, "SparkDataFrame")
   expect_equal(columns(df), c("_1", "_2"))
   expect_equal(columns(dfAsDF), c("_1", "_2"))

   schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
                        structField(x = "b", type = "string", nullable = TRUE))
   df <- createDataFrame(sqlContext, rdd, schema)
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))

   rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
   df <- createDataFrame(sqlContext, rdd)
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 10)
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
@@ -155,7 +155,7 @@ test_that("create DataFrame from RDD", {
                         age = c(19L, 23L, 18L),
                         height = c(176.5, 181.4, 173.7))
   df <- createDataFrame(sqlContext, localDF, schema)
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 3)
   expect_equal(columns(df), c("name", "age", "height"))
   expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float")))
@@ -218,25 +218,25 @@ test_that("convert NAs to null type in DataFrames", {
 test_that("toDF", {
   rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
   df <- toDF(rdd, list("a", "b"))
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 10)
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))

   df <- toDF(rdd)
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(columns(df), c("_1", "_2"))

   schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
                        structField(x = "b", type = "string", nullable = TRUE))
   df <- toDF(rdd, schema)
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))

   rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
   df <- toDF(rdd)
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 10)
   expect_equal(columns(df), c("a", "b"))
   expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
@@ -377,7 +377,7 @@ test_that("Collect DataFrame with complex types", {
 test_that("read/write json files", {
   # Test read.df
   df <- read.df(sqlContext, jsonPath, "json")
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 3)

   # Test read.df with a user defined schema
@@ -385,17 +385,17 @@ test_that("read/write json files", {
                        structField("age", type = "double"))
   df1 <- read.df(sqlContext, jsonPath, "json", schema)
-  expect_is(df1, "DataFrame")
+  expect_is(df1, "SparkDataFrame")
   expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))

   # Test loadDF
   df2 <- loadDF(sqlContext, jsonPath, "json", schema)
-  expect_is(df2, "DataFrame")
+  expect_is(df2, "SparkDataFrame")
   expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))

   # Test read.json
   df <- read.json(sqlContext, jsonPath)
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 3)

   # Test write.df
@@ -408,11 +408,11 @@ test_that("read/write json files", {

   # Test read.json()/jsonFile() works with multiple input paths
   jsonDF1 <- read.json(sqlContext, c(jsonPath2, jsonPath3))
-  expect_is(jsonDF1, "DataFrame")
+  expect_is(jsonDF1, "SparkDataFrame")
   expect_equal(count(jsonDF1), 6)
   # Suppress warnings because jsonFile is deprecated
   jsonDF2 <- suppressWarnings(jsonFile(sqlContext, c(jsonPath2, jsonPath3)))
-  expect_is(jsonDF2, "DataFrame")
+  expect_is(jsonDF2, "SparkDataFrame")
   expect_equal(count(jsonDF2), 6)

   unlink(jsonPath2)
@@ -423,12 +423,12 @@ test_that("jsonRDD() on a RDD with json string", {
   rdd <- parallelize(sc, mockLines)
   expect_equal(count(rdd), 3)
   df <- suppressWarnings(jsonRDD(sqlContext, rdd))
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 3)

   rdd2 <- flatMap(rdd, function(x) c(x, x))
   df <- suppressWarnings(jsonRDD(sqlContext, rdd2))
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 6)
 })
@@ -454,7 +454,7 @@ test_that("registerTempTable() results in a queryable table and sql() results in
   df <- read.json(sqlContext, jsonPath)
   registerTempTable(df, "table1")
   newdf <- sql(sqlContext, "SELECT * FROM table1 where name = 'Michael'")
-  expect_is(newdf, "DataFrame")
+  expect_is(newdf, "SparkDataFrame")
   expect_equal(count(newdf), 1)
   dropTempTable(sqlContext, "table1")
 })
@@ -493,7 +493,7 @@ test_that("tableToDF() returns a new DataFrame", {
   df <- read.json(sqlContext, jsonPath)
   registerTempTable(df, "table1")
   tabledf <- tableToDF(sqlContext, "table1")
-  expect_is(tabledf, "DataFrame")
+  expect_is(tabledf, "SparkDataFrame")
   expect_equal(count(tabledf), 3)
   tabledf2 <- tableToDF(sqlContext, "table1")
   expect_equal(count(tabledf2), 3)
@@ -595,7 +595,7 @@ test_that("collect() returns a data.frame", {
 test_that("limit() returns DataFrame with the correct number of rows", {
   df <- read.json(sqlContext, jsonPath)
   dfLimited <- limit(df, 2)
-  expect_is(dfLimited, "DataFrame")
+  expect_is(dfLimited, "SparkDataFrame")
   expect_equal(count(dfLimited), 2)
 })
@@ -750,11 +750,11 @@ test_that("distinct(), unique() and dropDuplicates() on DataFrames", {
   df <- read.json(sqlContext, jsonPathWithDup)

   uniques <- distinct(df)
-  expect_is(uniques, "DataFrame")
+  expect_is(uniques, "SparkDataFrame")
   expect_equal(count(uniques), 3)

   uniques2 <- unique(df)
-  expect_is(uniques2, "DataFrame")
+  expect_is(uniques2, "SparkDataFrame")
   expect_equal(count(uniques2), 3)

   # Test dropDuplicates()
@@ -798,7 +798,7 @@ test_that("sample on a DataFrame", {
   df <- read.json(sqlContext, jsonPath)
   sampled <- sample(df, FALSE, 1.0)
   expect_equal(nrow(collect(sampled)), count(df))
-  expect_is(sampled, "DataFrame")
+  expect_is(sampled, "SparkDataFrame")

   sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result
   expect_true(count(sampled2) < 3)
@@ -822,11 +822,11 @@ test_that("select operators", {
   expect_is(df[[2]], "Column")
   expect_is(df[["age"]], "Column")
-  expect_is(df[, 1], "DataFrame")
+  expect_is(df[, 1], "SparkDataFrame")
   expect_equal(columns(df[, 1]), c("name"))
   expect_equal(columns(df[, "age"]), c("age"))

   df2 <- df[, c("age", "name")]
-  expect_is(df2, "DataFrame")
+  expect_is(df2, "SparkDataFrame")
   expect_equal(columns(df2), c("age", "name"))

   df$age2 <- df$age
@@ -890,7 +890,7 @@ test_that("subsetting", {
   expect_equal(collect(filtered)$name, "Andy")

   df2 <- df[df$age == 19, 1]
-  expect_is(df2, "DataFrame")
+  expect_is(df2, "SparkDataFrame")
   expect_equal(count(df2), 1)
   expect_equal(columns(df2), c("name"))
   expect_equal(collect(df2)$name, "Justin")
@@ -940,7 +940,7 @@ test_that("column calculation", {
   d <- collect(select(df, alias(df$age + 1, "age2")))
   expect_equal(names(d), c("age2"))
   df2 <- select(df, lower(df$name), abs(df$age))
-  expect_is(df2, "DataFrame")
+  expect_is(df2, "SparkDataFrame")
   expect_equal(count(df2), 3)
 })
@@ -953,30 +953,30 @@ test_that("test HiveContext", {
     skip("Hive is not build with SparkSQL, skipped")
   })
   df <- createExternalTable(hiveCtx, "json", jsonPath, "json")
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(count(df), 3)

   df2 <- sql(hiveCtx, "select * from json")
-  expect_is(df2, "DataFrame")
+  expect_is(df2, "SparkDataFrame")
   expect_equal(count(df2), 3)

   jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
   invisible(saveAsTable(df, "json2", "json", "append", path = jsonPath2))
   df3 <- sql(hiveCtx, "select * from json2")
-  expect_is(df3, "DataFrame")
+  expect_is(df3, "SparkDataFrame")
   expect_equal(count(df3), 3)
   unlink(jsonPath2)

   hivetestDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
   invisible(saveAsTable(df, "hivetestbl", path = hivetestDataPath))
   df4 <- sql(hiveCtx, "select * from hivetestbl")
-  expect_is(df4, "DataFrame")
+  expect_is(df4, "SparkDataFrame")
   expect_equal(count(df4), 3)
   unlink(hivetestDataPath)

   parquetDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
   invisible(saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath))
   df5 <- sql(hiveCtx, "select * from parquetest")
-  expect_is(df5, "DataFrame")
+  expect_is(df5, "SparkDataFrame")
   expect_equal(count(df5), 3)
   unlink(parquetDataPath)
 })
@@ -1272,28 +1272,28 @@ test_that("group by, agg functions", {
   gd <- groupBy(df, "name")
   expect_is(gd, "GroupedData")
   df2 <- count(gd)
-  expect_is(df2, "DataFrame")
+  expect_is(df2, "SparkDataFrame")
   expect_equal(3, count(df2))

   # Also test group_by, summarize, mean
   gd1 <- group_by(df, "name")
   expect_is(gd1, "GroupedData")
   df_summarized <- summarize(gd, mean_age = mean(df$age))
-  expect_is(df_summarized, "DataFrame")
+  expect_is(df_summarized, "SparkDataFrame")
   expect_equal(3, count(df_summarized))

   df3 <- agg(gd, age = "stddev")
-  expect_is(df3, "DataFrame")
+  expect_is(df3, "SparkDataFrame")
   df3_local <- collect(df3)
   expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2]))

   df4 <- agg(gd, sumAge = sum(df$age))
-  expect_is(df4, "DataFrame")
+  expect_is(df4, "SparkDataFrame")
   expect_equal(3, count(df4))
   expect_equal(columns(df4), c("name", "sumAge"))

   df5 <- sum(gd, "age")
-  expect_is(df5, "DataFrame")
+  expect_is(df5, "SparkDataFrame")
   expect_equal(3, count(df5))

   expect_equal(3, count(mean(gd)))
@@ -1521,22 +1521,22 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
   df2 <- read.df(sqlContext, jsonPath2, "json")

   unioned <- arrange(unionAll(df, df2), df$age)
-  expect_is(unioned, "DataFrame")
+  expect_is(unioned, "SparkDataFrame")
   expect_equal(count(unioned), 6)
   expect_equal(first(unioned)$name, "Michael")

   unioned2 <- arrange(rbind(unioned, df, df2), df$age)
-  expect_is(unioned2, "DataFrame")
+  expect_is(unioned2, "SparkDataFrame")
   expect_equal(count(unioned2), 12)
   expect_equal(first(unioned2)$name, "Michael")

   excepted <- arrange(except(df, df2), desc(df$age))
-  expect_is(unioned, "DataFrame")
+  expect_is(unioned, "SparkDataFrame")
   expect_equal(count(excepted), 2)
   expect_equal(first(excepted)$name, "Justin")

   intersected <- arrange(intersect(df, df2), df$age)
-  expect_is(unioned, "DataFrame")
+  expect_is(unioned, "SparkDataFrame")
   expect_equal(count(intersected), 1)
   expect_equal(first(intersected)$name, "Andy")
@@ -1601,7 +1601,7 @@ test_that("read/write Parquet files", {
   # Test write.df and read.df
   write.df(df, parquetPath, "parquet", mode = "overwrite")
   df2 <- read.df(sqlContext, parquetPath, "parquet")
-  expect_is(df2, "DataFrame")
+  expect_is(df2, "SparkDataFrame")
   expect_equal(count(df2), 3)

   # Test write.parquet/saveAsParquetFile and read.parquet/parquetFile
@@ -1610,10 +1610,10 @@ test_that("read/write Parquet files", {
   parquetPath3 <- tempfile(pattern = "parquetPath3", fileext = ".parquet")
   suppressWarnings(saveAsParquetFile(df, parquetPath3))
   parquetDF <- read.parquet(sqlContext, c(parquetPath2, parquetPath3))
-  expect_is(parquetDF, "DataFrame")
+  expect_is(parquetDF, "SparkDataFrame")
   expect_equal(count(parquetDF), count(df) * 2)
   parquetDF2 <- suppressWarnings(parquetFile(sqlContext, parquetPath2, parquetPath3))
-  expect_is(parquetDF2, "DataFrame")
+  expect_is(parquetDF2, "SparkDataFrame")
   expect_equal(count(parquetDF2), count(df) * 2)

   # Test if varargs works with variables
@@ -1630,7 +1630,7 @@ test_that("read/write Parquet files", {
 test_that("read/write text files", {
   # Test write.df and read.df
   df <- read.df(sqlContext, jsonPath, "text")
-  expect_is(df, "DataFrame")
+  expect_is(df, "SparkDataFrame")
   expect_equal(colnames(df), c("value"))
   expect_equal(count(df), 3)
   textPath <- tempfile(pattern = "textPath", fileext = ".txt")
@@ -1640,7 +1640,7 @@ test_that("read/write text files", {
   textPath2 <- tempfile(pattern = "textPath2", fileext = ".txt")
   write.text(df, textPath2)
   df2 <- read.text(sqlContext, c(textPath, textPath2))
-  expect_is(df2, "DataFrame")
+  expect_is(df2, "SparkDataFrame")
   expect_equal(colnames(df2), c("value"))
   expect_equal(count(df2), count(df) * 2)
@@ -1877,7 +1877,7 @@ test_that("attach() on a DataFrame", {
   df <- read.json(sqlContext, jsonPath)
   expect_error(age)
   attach(df)
-  expect_is(age, "DataFrame")
+  expect_is(age, "SparkDataFrame")
   expected_age <- data.frame(age = c(NA, 30, 19))
   expect_equal(head(age), expected_age)
   stat <- summary(age)
@@ -1936,7 +1936,7 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", {
   expect_equal(dtypes(df), list(c("name", "string"), c("age", "double")))

   expect_error(coltypes(df) <- c("character"),
-               "Length of type vector should match the number of columns for DataFrame")
+               "Length of type vector should match the number of columns for SparkDataFrame")
   expect_error(coltypes(df) <- c("environment", "list"),
                "Only atomic type is supported for column types")
 })
@@ -1950,7 +1950,7 @@ test_that("Method str()", {
   out <- capture.output(str(irisDF2))
   expect_equal(length(out), 7)
-  expect_equal(out[1], "'DataFrame': 6 variables:")
+  expect_equal(out[1], "'SparkDataFrame': 6 variables:")
   expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4")
   expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9")
   expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7")