author     Liwei Lin <lwlin7@gmail.com>    2016-09-18 19:25:58 +0100
committer  Sean Owen <sowen@cloudera.com>  2016-09-18 19:25:58 +0100
commit     1dbb725dbef30bf7633584ce8efdb573f2d92bca (patch)
tree       ca63691ee0b6e70ed661c95743c4c140126bb0e2 /sql/core/src/test/scala
parent     7151011b38a841d9d4bc2e453b9a7cfe42f74f8f (diff)
[SPARK-16462][SPARK-16460][SPARK-15144][SQL] Make CSV cast null values properly
## Problem

CSV in Spark 2.0.0:

- does not read null values back correctly for certain data types such as `Boolean`, `TimestampType`, `DateType` -- this is a regression compared with 1.6;
- does not read empty values (specified by `options.nullValue`) back as `null`s for `StringType` -- this is compatible with 1.6 but leads to problems like SPARK-16903.

## What changes were proposed in this pull request?

This patch makes the CSV reader read all empty values (as specified by `options.nullValue`) back as `null`s.

## How was this patch tested?

New test cases.

Author: Liwei Lin <lwlin7@gmail.com>

Closes #14118 from lw-lin/csv-cast-null.
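For context, the sketch below illustrates the user-facing behaviour this change targets through the public `DataFrameReader` CSV API. It is not part of the commit; the local `SparkSession`, the schema, and the input file `people.csv` are illustrative assumptions.

```scala
// Hypothetical input file `people.csv`:
//   name,age,active
//   -,25,true
//   alice,-,-
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

val spark = SparkSession.builder()
  .appName("csv-null-demo")
  .master("local[*]")
  .getOrCreate()

val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true),
  StructField("active", BooleanType, nullable = true)))

val df = spark.read
  .option("header", "true")
  .option("nullValue", "-")   // every field equal to "-" should be read back as null
  .schema(schema)
  .csv("people.csv")

// With this patch, the "-" entries come back as null for every type in the
// schema, including StringType (previously the name column kept the literal "-").
df.show()
```

The `CSVTypeCastSuite` changes below exercise the same `nullValue` handling per data type at the unit level.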
Diffstat (limited to 'sql/core/src/test/scala')
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala         |  2
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala | 54
2 files changed, 35 insertions, 21 deletions
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 1930862118..29aac9def6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -554,7 +554,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     verifyCars(cars, withHeader = true, checkValues = false)
     val results = cars.collect()
-    assert(results(0).toSeq === Array(2012, "Tesla", "S", "null", "null"))
+    assert(results(0).toSeq === Array(2012, "Tesla", "S", null, null))
     assert(results(2).toSeq === Array(null, "Chevy", "Volt", null, null))
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala
index 3ce643e667..dae92f626c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala
@@ -68,16 +68,46 @@ class CSVTypeCastSuite extends SparkFunSuite {
   }

   test("Nullable types are handled") {
-    assert(CSVTypeCast.castTo("", IntegerType, nullable = true, CSVOptions()) == null)
+    assertNull(
+      CSVTypeCast.castTo("-", ByteType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", ShortType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", LongType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", FloatType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", BooleanType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", TimestampType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", DateType, nullable = true, CSVOptions("nullValue", "-")))
+    assertNull(
+      CSVTypeCast.castTo("-", StringType, nullable = true, CSVOptions("nullValue", "-")))
   }

-  test("String type should always return the same as the input") {
+  test("String type should also respect `nullValue`") {
+    assertNull(
+      CSVTypeCast.castTo("", StringType, nullable = true, CSVOptions()))
     assert(
-      CSVTypeCast.castTo("", StringType, nullable = true, CSVOptions()) ==
+      CSVTypeCast.castTo("", StringType, nullable = false, CSVOptions()) ==
       UTF8String.fromString(""))
+
     assert(
-      CSVTypeCast.castTo("", StringType, nullable = false, CSVOptions()) ==
+      CSVTypeCast.castTo("", StringType, nullable = true, CSVOptions("nullValue", "null")) ==
+      UTF8String.fromString(""))
+    assert(
+      CSVTypeCast.castTo("", StringType, nullable = false, CSVOptions("nullValue", "null")) ==
       UTF8String.fromString(""))
+
+    assertNull(
+      CSVTypeCast.castTo(null, StringType, nullable = true, CSVOptions("nullValue", "null")))
   }

   test("Throws exception for empty string with non null type") {
@@ -170,20 +200,4 @@ class CSVTypeCastSuite extends SparkFunSuite {
     assert(doubleVal2 == Double.PositiveInfinity)
   }

-  test("Type-specific null values are used for casting") {
-    assertNull(
-      CSVTypeCast.castTo("-", ByteType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", ShortType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", LongType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", FloatType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
-    assertNull(
-      CSVTypeCast.castTo("-", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
-  }
 }