aboutsummaryrefslogtreecommitdiff
path: root/sql/core/src/test
diff options
context:
space:
mode:
authorhyukjinkwon <gurwls223@gmail.com>2016-11-06 18:52:05 -0800
committerReynold Xin <rxin@databricks.com>2016-11-06 18:52:05 -0800
commit556a3b7d07f36c29ceb88fb6c24cc229e0e53ee4 (patch)
treeba48a84dc34757cd6691b6a7ce9abdb615c4c1ac /sql/core/src/test
parentb89d0556dff0520ab35882382242fbfa7d9478eb (diff)
downloadspark-556a3b7d07f36c29ceb88fb6c24cc229e0e53ee4.tar.gz
spark-556a3b7d07f36c29ceb88fb6c24cc229e0e53ee4.tar.bz2
spark-556a3b7d07f36c29ceb88fb6c24cc229e0e53ee4.zip
[SPARK-18269][SQL] CSV datasource should read null properly when schema is larger than parsed tokens
## What changes were proposed in this pull request? Currently, there are three cases when reading CSV by datasource when it is `PERMISSIVE` parse mode. - schema == parsed tokens (from each line) No problem to cast the value in the tokens to the field in the schema as they are equal. - schema < parsed tokens (from each line) It slices the tokens into the number of fields in schema. - schema > parsed tokens (from each line) It appends `null` into parsed tokens so that values can be safely cast with the schema. However, when `null` is appended in the third case, we should take `null` into account when casting the values. In case of `StringType`, it is fine as `UTF8String.fromString(datum)` produces `null` when the input is `null`. Therefore, this case will happen only when schema is explicitly given and schema includes data types that are not `StringType`. The codes below: ```scala val path = "/tmp/a" Seq("1").toDF().write.text(path.getAbsolutePath) val schema = StructType( StructField("a", IntegerType, true) :: StructField("b", IntegerType, true) :: Nil) spark.read.schema(schema).option("header", "false").csv(path).show() ``` prints **Before** ``` java.lang.NumberFormatException: null at java.lang.Integer.parseInt(Integer.java:542) at java.lang.Integer.parseInt(Integer.java:615) at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:272) at scala.collection.immutable.StringOps.toInt(StringOps.scala:29) at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$.castTo(CSVInferSchema.scala:24) ``` **After** ``` +---+----+ | a| b| +---+----+ | 1|null| +---+----+ ``` ## How was this patch tested? Unit test in `CSVSuite.scala` and `CSVTypeCastSuite.scala` Author: hyukjinkwon <gurwls223@gmail.com> Closes #15767 from HyukjinKwon/SPARK-18269.
Diffstat (limited to 'sql/core/src/test')
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala15
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala93
2 files changed, 67 insertions, 41 deletions
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 8209b5bd7f..491ff72337 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -890,4 +890,19 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}
}
}
+
+ test("load null when the schema is larger than parsed tokens ") {
+ withTempPath { path =>
+ Seq("1").toDF().write.text(path.getAbsolutePath)
+ val schema = StructType(
+ StructField("a", IntegerType, true) ::
+ StructField("b", IntegerType, true) :: Nil)
+ val df = spark.read
+ .schema(schema)
+ .option("header", "false")
+ .csv(path.getAbsolutePath)
+
+ checkAnswer(df, Row(1, null))
+ }
+ }
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala
index c74406b9cb..46333d1213 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVTypeCastSuite.scala
@@ -36,7 +36,7 @@ class CSVTypeCastSuite extends SparkFunSuite {
stringValues.zip(decimalValues).foreach { case (strVal, decimalVal) =>
val decimalValue = new BigDecimal(decimalVal.toString)
- assert(CSVTypeCast.castTo(strVal, decimalType) ===
+ assert(CSVTypeCast.castTo(strVal, "_1", decimalType) ===
Decimal(decimalValue, decimalType.precision, decimalType.scale))
}
}
@@ -67,80 +67,91 @@ class CSVTypeCastSuite extends SparkFunSuite {
test("Nullable types are handled") {
assertNull(
- CSVTypeCast.castTo("-", ByteType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", ByteType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", ShortType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", ShortType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", LongType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", LongType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", FloatType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", FloatType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", DoubleType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", BooleanType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", BooleanType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", DecimalType.DoubleDecimal, true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", TimestampType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", TimestampType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", DateType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", DateType, nullable = true, CSVOptions("nullValue", "-")))
assertNull(
- CSVTypeCast.castTo("-", StringType, nullable = true, CSVOptions("nullValue", "-")))
+ CSVTypeCast.castTo("-", "_1", StringType, nullable = true, CSVOptions("nullValue", "-")))
+ assertNull(
+ CSVTypeCast.castTo(null, "_1", IntegerType, nullable = true, CSVOptions("nullValue", "-")))
+
+ // casting a null to not nullable field should throw an exception.
+ var message = intercept[RuntimeException] {
+ CSVTypeCast.castTo(null, "_1", IntegerType, nullable = false, CSVOptions("nullValue", "-"))
+ }.getMessage
+ assert(message.contains("null value found but field _1 is not nullable."))
+
+ message = intercept[RuntimeException] {
+ CSVTypeCast.castTo("-", "_1", StringType, nullable = false, CSVOptions("nullValue", "-"))
+ }.getMessage
+ assert(message.contains("null value found but field _1 is not nullable."))
}
test("String type should also respect `nullValue`") {
assertNull(
- CSVTypeCast.castTo("", StringType, nullable = true, CSVOptions()))
- assert(
- CSVTypeCast.castTo("", StringType, nullable = false, CSVOptions()) ==
- UTF8String.fromString(""))
+ CSVTypeCast.castTo("", "_1", StringType, nullable = true, CSVOptions()))
assert(
- CSVTypeCast.castTo("", StringType, nullable = true, CSVOptions("nullValue", "null")) ==
+ CSVTypeCast.castTo("", "_1", StringType, nullable = true, CSVOptions("nullValue", "null")) ==
UTF8String.fromString(""))
assert(
- CSVTypeCast.castTo("", StringType, nullable = false, CSVOptions("nullValue", "null")) ==
+ CSVTypeCast.castTo("", "_1", StringType, nullable = false, CSVOptions("nullValue", "null")) ==
UTF8String.fromString(""))
assertNull(
- CSVTypeCast.castTo(null, StringType, nullable = true, CSVOptions("nullValue", "null")))
+ CSVTypeCast.castTo(null, "_1", StringType, nullable = true, CSVOptions("nullValue", "null")))
}
test("Throws exception for empty string with non null type") {
- val exception = intercept[NumberFormatException]{
- CSVTypeCast.castTo("", IntegerType, nullable = false, CSVOptions())
+ val exception = intercept[RuntimeException]{
+ CSVTypeCast.castTo("", "_1", IntegerType, nullable = false, CSVOptions())
}
- assert(exception.getMessage.contains("For input string: \"\""))
+ assert(exception.getMessage.contains("null value found but field _1 is not nullable."))
}
test("Types are cast correctly") {
- assert(CSVTypeCast.castTo("10", ByteType) == 10)
- assert(CSVTypeCast.castTo("10", ShortType) == 10)
- assert(CSVTypeCast.castTo("10", IntegerType) == 10)
- assert(CSVTypeCast.castTo("10", LongType) == 10)
- assert(CSVTypeCast.castTo("1.00", FloatType) == 1.0)
- assert(CSVTypeCast.castTo("1.00", DoubleType) == 1.0)
- assert(CSVTypeCast.castTo("true", BooleanType) == true)
+ assert(CSVTypeCast.castTo("10", "_1", ByteType) == 10)
+ assert(CSVTypeCast.castTo("10", "_1", ShortType) == 10)
+ assert(CSVTypeCast.castTo("10", "_1", IntegerType) == 10)
+ assert(CSVTypeCast.castTo("10", "_1", LongType) == 10)
+ assert(CSVTypeCast.castTo("1.00", "_1", FloatType) == 1.0)
+ assert(CSVTypeCast.castTo("1.00", "_1", DoubleType) == 1.0)
+ assert(CSVTypeCast.castTo("true", "_1", BooleanType) == true)
val timestampsOptions = CSVOptions("timestampFormat", "dd/MM/yyyy hh:mm")
val customTimestamp = "31/01/2015 00:00"
val expectedTime = timestampsOptions.timestampFormat.parse(customTimestamp).getTime
val castedTimestamp =
- CSVTypeCast.castTo(customTimestamp, TimestampType, nullable = true, timestampsOptions)
+ CSVTypeCast.castTo(customTimestamp, "_1", TimestampType, nullable = true, timestampsOptions)
assert(castedTimestamp == expectedTime * 1000L)
val customDate = "31/01/2015"
val dateOptions = CSVOptions("dateFormat", "dd/MM/yyyy")
val expectedDate = dateOptions.dateFormat.parse(customDate).getTime
- val castedDate = CSVTypeCast.castTo(customTimestamp, DateType, nullable = true, dateOptions)
+ val castedDate =
+ CSVTypeCast.castTo(customTimestamp, "_1", DateType, nullable = true, dateOptions)
assert(castedDate == DateTimeUtils.millisToDays(expectedDate))
val timestamp = "2015-01-01 00:00:00"
- assert(CSVTypeCast.castTo(timestamp, TimestampType) ==
+ assert(CSVTypeCast.castTo(timestamp, "_1", TimestampType) ==
DateTimeUtils.stringToTime(timestamp).getTime * 1000L)
- assert(CSVTypeCast.castTo("2015-01-01", DateType) ==
+ assert(CSVTypeCast.castTo("2015-01-01", "_1", DateType) ==
DateTimeUtils.millisToDays(DateTimeUtils.stringToTime("2015-01-01").getTime))
}
@@ -148,8 +159,8 @@ class CSVTypeCastSuite extends SparkFunSuite {
val originalLocale = Locale.getDefault
try {
Locale.setDefault(new Locale("fr", "FR"))
- assert(CSVTypeCast.castTo("1,00", FloatType) == 100.0) // Would parse as 1.0 in fr-FR
- assert(CSVTypeCast.castTo("1,00", DoubleType) == 100.0)
+ assert(CSVTypeCast.castTo("1,00", "_1", FloatType) == 100.0) // Would parse as 1.0 in fr-FR
+ assert(CSVTypeCast.castTo("1,00", "_1", DoubleType) == 100.0)
} finally {
Locale.setDefault(originalLocale)
}
@@ -157,7 +168,7 @@ class CSVTypeCastSuite extends SparkFunSuite {
test("Float NaN values are parsed correctly") {
val floatVal: Float = CSVTypeCast.castTo(
- "nn", FloatType, nullable = true, CSVOptions("nanValue", "nn")).asInstanceOf[Float]
+ "nn", "_1", FloatType, nullable = true, CSVOptions("nanValue", "nn")).asInstanceOf[Float]
// Java implements the IEEE-754 floating point standard which guarantees that any comparison
// against NaN will return false (except != which returns true)
@@ -166,32 +177,32 @@ class CSVTypeCastSuite extends SparkFunSuite {
test("Double NaN values are parsed correctly") {
val doubleVal: Double = CSVTypeCast.castTo(
- "-", DoubleType, nullable = true, CSVOptions("nanValue", "-")).asInstanceOf[Double]
+ "-", "_1", DoubleType, nullable = true, CSVOptions("nanValue", "-")).asInstanceOf[Double]
assert(doubleVal.isNaN)
}
test("Float infinite values can be parsed") {
val floatVal1 = CSVTypeCast.castTo(
- "max", FloatType, nullable = true, CSVOptions("negativeInf", "max")).asInstanceOf[Float]
+ "max", "_1", FloatType, nullable = true, CSVOptions("negativeInf", "max")).asInstanceOf[Float]
assert(floatVal1 == Float.NegativeInfinity)
val floatVal2 = CSVTypeCast.castTo(
- "max", FloatType, nullable = true, CSVOptions("positiveInf", "max")).asInstanceOf[Float]
+ "max", "_1", FloatType, nullable = true, CSVOptions("positiveInf", "max")).asInstanceOf[Float]
assert(floatVal2 == Float.PositiveInfinity)
}
test("Double infinite values can be parsed") {
val doubleVal1 = CSVTypeCast.castTo(
- "max", DoubleType, nullable = true, CSVOptions("negativeInf", "max")
+ "max", "_1", DoubleType, nullable = true, CSVOptions("negativeInf", "max")
).asInstanceOf[Double]
assert(doubleVal1 == Double.NegativeInfinity)
val doubleVal2 = CSVTypeCast.castTo(
- "max", DoubleType, nullable = true, CSVOptions("positiveInf", "max")
+ "max", "_1", DoubleType, nullable = true, CSVOptions("positiveInf", "max")
).asInstanceOf[Double]
assert(doubleVal2 == Double.PositiveInfinity)