[SPARK-11753][SQL][TEST-HADOOP2.2] Make allowNonNumericNumbers option work

## What changes were proposed in this pull request? Jackson supports the `allowNonNumericNumbers` option to parse non-standard non-numeric numbers such as "NaN", "Infinity", "INF". The currently used Jackson version (2.5.3) doesn't support all of them. This patch upgrades the library and makes the two ignored tests in `JsonParsingOptionsSuite` pass. ## How was this patch tested? `JsonParsingOptionsSuite`. Author: Liang-Chi Hsieh <simonh@tw.ibm.com> Author: Liang-Chi Hsieh <viirya@appier.com> Closes #9759 from viirya/fix-json-nonnumric.
author: Liang-Chi Hsieh <simonh@tw.ibm.com> 2016-05-24 09:43:39 -0700
committer: Wenchen Fan <wenchen@databricks.com> 2016-05-24 09:43:39 -0700
commit: c24b6b679c3efa053f7de19be73eb36dc70d9930 (patch)
tree: 6a0062ba7892812485dcc01b01e731c61e632ca9 /sql
parent: 6075f5b4d8e98483d26c31576f58e2229024b4f4 (diff)
download: spark-c24b6b679c3efa053f7de19be73eb36dc70d9930.tar.gz
spark-c24b6b679c3efa053f7de19be73eb36dc70d9930.tar.bz2
spark-c24b6b679c3efa053f7de19be73eb36dc70d9930.zip
3 files changed, 62 insertions, 27 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 57a2091fe8..0fed9171a8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -293,6 +293,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * </li>
    * <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
    * (e.g. 00012)</li>
+   * <li>`allowNonNumericNumbers` (default `true`): allows using non-numeric numbers such as "NaN",
+   * "Infinity", "-Infinity", "INF", "-INF", which are converted to floating point numbers.</li>
    * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
    * character using backslash quoting mechanism</li>
    * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala
index aeee2600a1..cafca32318 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala
@@ -129,13 +129,15 @@ object JacksonParser extends Logging {
       case (VALUE_STRING, FloatType) =>
         // Special case handling for NaN and Infinity.
         val value = parser.getText
-        val lowerCaseValue = value.toLowerCase()
-        if (lowerCaseValue.equals("nan") ||
-          lowerCaseValue.equals("infinity") ||
-          lowerCaseValue.equals("-infinity") ||
-          lowerCaseValue.equals("inf") ||
-          lowerCaseValue.equals("-inf")) {
+        if (value.equals("NaN") ||
+          value.equals("Infinity") ||
+          value.equals("+Infinity") ||
+          value.equals("-Infinity")) {
           value.toFloat
+        } else if (value.equals("+INF") || value.equals("INF")) {
+          Float.PositiveInfinity
+        } else if (value.equals("-INF")) {
+          Float.NegativeInfinity
         } else {
           throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.")
         }
@@ -146,13 +148,15 @@ object JacksonParser extends Logging {
       case (VALUE_STRING, DoubleType) =>
         // Special case handling for NaN and Infinity.
         val value = parser.getText
-        val lowerCaseValue = value.toLowerCase()
-        if (lowerCaseValue.equals("nan") ||
-          lowerCaseValue.equals("infinity") ||
-          lowerCaseValue.equals("-infinity") ||
-          lowerCaseValue.equals("inf") ||
-          lowerCaseValue.equals("-inf")) {
+        if (value.equals("NaN") ||
+          value.equals("Infinity") ||
+          value.equals("+Infinity") ||
+          value.equals("-Infinity")) {
           value.toDouble
+        } else if (value.equals("+INF") || value.equals("INF")) {
+          Double.PositiveInfinity
+        } else if (value.equals("-INF")) {
+          Double.NegativeInfinity
         } else {
           throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.")
         }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
index c31dffedbd..2aab955c1e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.json
 
 import org.apache.spark.sql.QueryTest
 import org.apache.spark.sql.test.SharedSQLContext
+import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
 
 /**
  * Test cases for various [[JSONOptions]].
@@ -93,23 +94,51 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
     assert(df.first().getLong(0) == 18)
   }
 
-  // The following two tests are not really working - need to look into Jackson's
-  // JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS.
-  ignore("allowNonNumericNumbers off") {
-    val str = """{"age": NaN}"""
-    val rdd = spark.sparkContext.parallelize(Seq(str))
-    val df = spark.read.json(rdd)
-
-    assert(df.schema.head.name == "_corrupt_record")
+  test("allowNonNumericNumbers off") {
+    // non-quoted non-numeric numbers don't work if allowNonNumericNumbers is off.
+    var testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
+      """{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": INF}""",
+      """{"age": +INF}""", """{"age": -INF}""")
+    testCases.foreach { str =>
+      val rdd = spark.sparkContext.parallelize(Seq(str))
+      val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd)
+
+      assert(df.schema.head.name == "_corrupt_record")
+    }
+
+    // quoted non-numeric numbers should still work even if allowNonNumericNumbers is off.
+    testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "+Infinity"}""",
+      """{"age": "-Infinity"}""", """{"age": "INF"}""", """{"age": "+INF"}""",
+      """{"age": "-INF"}""")
+    val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity,
+      _.isNegInfinity, _.isPosInfinity, _.isPosInfinity, _.isNegInfinity)
+    val schema = StructType(StructField("age", DoubleType, true) :: Nil)
+
+    testCases.zipWithIndex.foreach { case (str, idx) =>
+      val rdd = spark.sparkContext.parallelize(Seq(str))
+      val df = spark.read.option("allowNonNumericNumbers", "false").schema(schema).json(rdd)
+
+      assert(df.schema.head.name == "age")
+      assert(tests(idx)(df.first().getDouble(0)))
+    }
   }
 
-  ignore("allowNonNumericNumbers on") {
-    val str = """{"age": NaN}"""
-    val rdd = spark.sparkContext.parallelize(Seq(str))
-    val df = spark.read.option("allowNonNumericNumbers", "true").json(rdd)
-
-    assert(df.schema.head.name == "age")
-    assert(df.first().getDouble(0).isNaN)
+  test("allowNonNumericNumbers on") {
+    val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
+      """{"age": +Infinity}""", """{"age": -Infinity}""", """{"age": +INF}""",
+      """{"age": -INF}""", """{"age": "NaN"}""", """{"age": "Infinity"}""",
+      """{"age": "-Infinity"}""")
+    val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isPosInfinity,
+      _.isNegInfinity, _.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity,
+      _.isNegInfinity, _.isPosInfinity, _.isNegInfinity)
+    val schema = StructType(StructField("age", DoubleType, true) :: Nil)
+    testCases.zipWithIndex.foreach { case (str, idx) =>
+      val rdd = spark.sparkContext.parallelize(Seq(str))
+      val df = spark.read.option("allowNonNumericNumbers", "true").schema(schema).json(rdd)
+
+      assert(df.schema.head.name == "age")
+      assert(tests(idx)(df.first().getDouble(0)))
+    }
   }
 
   test("allowBackslashEscapingAnyCharacter off") {
author	Liang-Chi Hsieh <simonh@tw.ibm.com>	2016-05-24 09:43:39 -0700
committer	Wenchen Fan <wenchen@databricks.com>	2016-05-24 09:43:39 -0700
commit	c24b6b679c3efa053f7de19be73eb36dc70d9930 (patch)
tree	6a0062ba7892812485dcc01b01e731c61e632ca9 /sql
parent	6075f5b4d8e98483d26c31576f58e2229024b4f4 (diff)
download	spark-c24b6b679c3efa053f7de19be73eb36dc70d9930.tar.gz spark-c24b6b679c3efa053f7de19be73eb36dc70d9930.tar.bz2 spark-c24b6b679c3efa053f7de19be73eb36dc70d9930.zip