[SPARK-16310][SPARKR] R na.string-like default for csv source

## What changes were proposed in this pull request? Apply default "NA" as the null string for R, like the na.strings parameter of R's read.csv. See https://stat.ethz.ch/R-manual/R-devel/library/utils/html/read.table.html, where na.strings = "NA". A user passing a csv file with NA values should get the same behavior with SparkR read.df(... source = "csv"). (couldn't open JIRA, will do that later) ## How was this patch tested? unit tests shivaram Author: Felix Cheung <felixcheung_m@hotmail.com> Closes #13984 from felixcheung/rcsvnastring.
author: Felix Cheung <felixcheung_m@hotmail.com> 2016-07-07 15:21:57 -0700
committer: Shivaram Venkataraman <shivaram@cs.berkeley.edu> 2016-07-07 15:21:57 -0700
commit: f4767bcc7a9d1bdd301f054776aa45e7c9f344a7 (patch)
tree: bdffd4748c034e0760f0f367e919929899e50c32 /R/pkg/inst
parent: 28710b42b0d18a55bd64d597558649537259b127 (diff)
download: spark-f4767bcc7a9d1bdd301f054776aa45e7c9f344a7.tar.gz
spark-f4767bcc7a9d1bdd301f054776aa45e7c9f344a7.tar.bz2
spark-f4767bcc7a9d1bdd301f054776aa45e7c9f344a7.zip
1 files changed, 26 insertions, 6 deletions
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index a3aa26d9e7..a0ab719202 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -213,15 +213,35 @@ test_that("read csv as DataFrame", {
   mockLinesCsv <- c("year,make,model,comment,blank",
                    "\"2012\",\"Tesla\",\"S\",\"No comment\",",
                    "1997,Ford,E350,\"Go get one now they are going fast\",",
-                   "2015,Chevy,Volt")
+                   "2015,Chevy,Volt",
+                   "NA,Dummy,Placeholder")
   writeLines(mockLinesCsv, csvPath)
 
-  # default "header" is false
-  df <- read.df(csvPath, "csv", header = "true")
-  expect_equal(count(df), 3)
+  # default "header" is false, inferSchema to handle "year" as "int"
+  df <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
+  expect_equal(count(df), 4)
   expect_equal(columns(df), c("year", "make", "model", "comment", "blank"))
-  expect_equal(sort(unlist(collect(where(df, df$year == "2015")))),
-               sort(unlist(list(year = "2015", make = "Chevy", model = "Volt"))))
+  expect_equal(sort(unlist(collect(where(df, df$year == 2015)))),
+               sort(unlist(list(year = 2015, make = "Chevy", model = "Volt"))))
+
+  # since "year" is "int", let's skip the NA values
+  withoutna <- na.omit(df, how = "any", cols = "year")
+  expect_equal(count(withoutna), 3)
+
+  unlink(csvPath)
+  csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
+  mockLinesCsv <- c("year,make,model,comment,blank",
+                   "\"2012\",\"Tesla\",\"S\",\"No comment\",",
+                   "1997,Ford,E350,\"Go get one now they are going fast\",",
+                   "2015,Chevy,Volt",
+                   "Empty,Dummy,Placeholder")
+  writeLines(mockLinesCsv, csvPath)
+
+  df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.string = "Empty")
+  expect_equal(count(df2), 4)
+  withoutna2 <- na.omit(df2, how = "any", cols = "year")
+  expect_equal(count(withoutna2), 3)
+  expect_equal(count(where(withoutna2, withoutna2$make == "Dummy")), 0)
 
   unlink(csvPath)
 })
author	Felix Cheung <felixcheung_m@hotmail.com>	2016-07-07 15:21:57 -0700
committer	Shivaram Venkataraman <shivaram@cs.berkeley.edu>	2016-07-07 15:21:57 -0700
commit	f4767bcc7a9d1bdd301f054776aa45e7c9f344a7 (patch)
tree	bdffd4748c034e0760f0f367e919929899e50c32 /R/pkg/inst
parent	28710b42b0d18a55bd64d597558649537259b127 (diff)
download	spark-f4767bcc7a9d1bdd301f054776aa45e7c9f344a7.tar.gz spark-f4767bcc7a9d1bdd301f054776aa45e7c9f344a7.tar.bz2 spark-f4767bcc7a9d1bdd301f054776aa45e7c9f344a7.zip