author     Felix Cheung <felixcheung_m@hotmail.com>             2016-07-07 15:21:57 -0700
committer  Shivaram Venkataraman <shivaram@cs.berkeley.edu>     2016-07-07 15:21:57 -0700
commit     f4767bcc7a9d1bdd301f054776aa45e7c9f344a7 (patch)
tree       bdffd4748c034e0760f0f367e919929899e50c32
parent     28710b42b0d18a55bd64d597558649537259b127 (diff)
[SPARK-16310][SPARKR] R na.string-like default for csv source
## What changes were proposed in this pull request?

Apply "NA" as the default null string for the csv source in R, matching the `na.strings = "NA"` default of R's `read.csv` (https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html). A user passing a csv file containing "NA" values should get the same behavior from SparkR `read.df(..., source = "csv")`. (couldn't open JIRA, will do that later)

## How was this patch tested?

unit tests

shivaram

Author: Felix Cheung <felixcheung_m@hotmail.com>

Closes #13984 from felixcheung/rcsvnastring.
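A minimal usage sketch of the behavior this change targets (assumes an active SparkR session; the file and column names below are illustrative, not part of this patch):

```r
# people.csv is a hypothetical file whose missing cells are written literally as "NA".
df <- read.df("people.csv", source = "csv", header = "true", inferSchema = "true")

# With this change, "NA" cells are read as null by default, so na.omit drops those rows.
collect(na.omit(df, how = "any", cols = "age"))

# The marker can still be overridden per call, mirroring read.csv(na.strings = ...):
df2 <- read.df("people.csv", source = "csv", header = "true", na.strings = "Empty")
```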
-rw-r--r--  R/pkg/R/SQLContext.R                       | 10
-rw-r--r--  R/pkg/inst/tests/testthat/test_sparkSQL.R  | 32
2 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
index 8df73db36e..bc0daa25c9 100644
--- a/R/pkg/R/SQLContext.R
+++ b/R/pkg/R/SQLContext.R
@@ -714,11 +714,14 @@ dropTempView <- function(viewName) {
#'
#' The data source is specified by the `source` and a set of options(...).
#' If `source` is not specified, the default data source configured by
-#' "spark.sql.sources.default" will be used.
+#' "spark.sql.sources.default" will be used. \cr
+#' Similar to R read.csv, when `source` is "csv", by default, a value of "NA" will be interpreted
+#' as NA.
#'
#' @param path The path of files to load
#' @param source The name of external data source
#' @param schema The data schema defined in structType
+#' @param na.strings Default string value for NA when source is "csv"
#' @return SparkDataFrame
#' @rdname read.df
#' @name read.df
@@ -735,7 +738,7 @@ dropTempView <- function(viewName) {
#' @name read.df
#' @method read.df default
#' @note read.df since 1.4.0
-read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) {
+read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.strings = "NA", ...) {
sparkSession <- getSparkSession()
options <- varargsToEnv(...)
if (!is.null(path)) {
@@ -744,6 +747,9 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, ...) {
if (is.null(source)) {
source <- getDefaultSqlSource()
}
+ if (source == "csv" && is.null(options[["nullValue"]])) {
+ options[["nullValue"]] <- na.strings
+ }
if (!is.null(schema)) {
stopifnot(class(schema) == "structType")
sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession, source,
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index a3aa26d9e7..a0ab719202 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -213,15 +213,35 @@ test_that("read csv as DataFrame", {
mockLinesCsv <- c("year,make,model,comment,blank",
"\"2012\",\"Tesla\",\"S\",\"No comment\",",
"1997,Ford,E350,\"Go get one now they are going fast\",",
- "2015,Chevy,Volt")
+ "2015,Chevy,Volt",
+ "NA,Dummy,Placeholder")
writeLines(mockLinesCsv, csvPath)
- # default "header" is false
- df <- read.df(csvPath, "csv", header = "true")
- expect_equal(count(df), 3)
+ # default "header" is false, inferSchema to handle "year" as "int"
+ df <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
+ expect_equal(count(df), 4)
expect_equal(columns(df), c("year", "make", "model", "comment", "blank"))
- expect_equal(sort(unlist(collect(where(df, df$year == "2015")))),
- sort(unlist(list(year = "2015", make = "Chevy", model = "Volt"))))
+ expect_equal(sort(unlist(collect(where(df, df$year == 2015)))),
+ sort(unlist(list(year = 2015, make = "Chevy", model = "Volt"))))
+
+ # since "year" is "int", let's skip the NA values
+ withoutna <- na.omit(df, how = "any", cols = "year")
+ expect_equal(count(withoutna), 3)
+
+ unlink(csvPath)
+ csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
+ mockLinesCsv <- c("year,make,model,comment,blank",
+ "\"2012\",\"Tesla\",\"S\",\"No comment\",",
+ "1997,Ford,E350,\"Go get one now they are going fast\",",
+ "2015,Chevy,Volt",
+ "Empty,Dummy,Placeholder")
+ writeLines(mockLinesCsv, csvPath)
+
+ df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty")
+ expect_equal(count(df2), 4)
+ withoutna2 <- na.omit(df2, how = "any", cols = "year")
+ expect_equal(count(withoutna2), 3)
+ expect_equal(count(where(withoutna2, withoutna2$make == "Dummy")), 0)
unlink(csvPath)
})
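For context, `na.strings` is a convenience on top of the csv data source's `nullValue` option: per the change above, it is forwarded as `nullValue` only when that option has not been supplied explicitly. A rough sketch of the resulting equivalence (the file name is illustrative):

```r
# These two calls should behave the same for the csv source:
df_a <- read.df("cars.csv", source = "csv", header = "true", na.strings = "Empty")
df_b <- read.df("cars.csv", source = "csv", header = "true", nullValue = "Empty")

# An explicitly supplied nullValue takes precedence; na.strings is not applied:
df_c <- read.df("cars.csv", source = "csv", header = "true",
                nullValue = "Empty", na.strings = "NA")  # "Empty" cells become null
```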