[SPARK-19488][SQL] Fix CSV schema inference when the field is NaN/Inf etc.

## What changes were proposed in this pull request? When inferring a CSV schema, Spark does not use the user-defined CSVOptions to parse the field, e.g. `inf` and `-inf`, which should be parsed as DoubleType. This PR adds checks against `options.nanValue`, `options.negativeInf`, and `options.positiveInf` to determine whether the field is a DoubleType. ## How was this patch tested? Unit test added. Author: windpiger <songjun@outlook.com> Closes #16834 from windpiger/fixinferInfSchemaCsv.
author: windpiger <songjun@outlook.com> 2017-02-08 14:30:28 +0800
committer: Wenchen Fan <wenchen@databricks.com> 2017-02-08 14:30:28 +0800
commit: d60dde26f98164ae146da1b5f409f4eb7c3621aa (patch)
tree: 477654049b435a3aefd5bd1e8e0a997de47b6c23 /sql/core/src/main/scala/org
parent: 5a0569ce693c635c5fa12b2de33ed3643ce888e3 (diff)
download: spark-d60dde26f98164ae146da1b5f409f4eb7c3621aa.tar.gz
spark-d60dde26f98164ae146da1b5f409f4eb7c3621aa.tar.bz2
spark-d60dde26f98164ae146da1b5f409f4eb7c3621aa.zip
1 files changed, 5 insertions, 1 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala
index 485b186c7c..3fa30fe240 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala
@@ -150,6 +150,10 @@ private[csv] object CSVInferSchema {
     }
   }
 
+  private def isInfOrNan(field: String, options: CSVOptions): Boolean = {
+    field == options.nanValue || field == options.negativeInf || field == options.positiveInf
+  }
+
   private def tryParseInteger(field: String, options: CSVOptions): DataType = {
     if ((allCatch opt field.toInt).isDefined) {
       IntegerType
@@ -185,7 +189,7 @@ private[csv] object CSVInferSchema {
   }
 
   private def tryParseDouble(field: String, options: CSVOptions): DataType = {
-    if ((allCatch opt field.toDouble).isDefined) {
+    if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field, options)) {
       DoubleType
     } else {
       tryParseTimestamp(field, options)
author	windpiger <songjun@outlook.com>	2017-02-08 14:30:28 +0800
committer	Wenchen Fan <wenchen@databricks.com>	2017-02-08 14:30:28 +0800
commit	d60dde26f98164ae146da1b5f409f4eb7c3621aa (patch)
tree	477654049b435a3aefd5bd1e8e0a997de47b6c23 /sql/core/src/main/scala/org
parent	5a0569ce693c635c5fa12b2de33ed3643ce888e3 (diff)
download	spark-d60dde26f98164ae146da1b5f409f4eb7c3621aa.tar.gz spark-d60dde26f98164ae146da1b5f409f4eb7c3621aa.tar.bz2 spark-d60dde26f98164ae146da1b5f409f4eb7c3621aa.zip