diff options
author | Yin Huai <huaiyin.thu@gmail.com> | 2014-07-07 17:05:59 -0700 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2014-07-07 17:05:59 -0700 |
commit | f0496ee10847db921a028a34f70385f9b740b3f3 (patch) | |
tree | e7ce39f5c6c6c7ad11fe7a904ee488682430e387 /sql | |
parent | 4deeed17c4847f212a4fa1a8685cfe8a12179263 (diff) | |
download | spark-f0496ee10847db921a028a34f70385f9b740b3f3.tar.gz spark-f0496ee10847db921a028a34f70385f9b740b3f3.tar.bz2 spark-f0496ee10847db921a028a34f70385f9b740b3f3.zip |
[SPARK-2375][SQL] JSON schema inference may not resolve type conflicts correctly for a field inside an array of structs
For example, for
```
{"array": [{"field":214748364700}, {"field":1}]}
```
the type of field is resolved as IntType, while for
```
{"array": [{"field":1}, {"field":214748364700}]}
```
the type of field is resolved as LongType.
JIRA: https://issues.apache.org/jira/browse/SPARK-2375
Author: Yin Huai <huaiyin.thu@gmail.com>
Closes #1308 from yhuai/SPARK-2375 and squashes the following commits:
3e2e312 [Yin Huai] Update unit test.
1b2ff9f [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2375
10794eb [Yin Huai] Correctly resolve the type of a field inside an array of structs.
Diffstat (limited to 'sql')
3 files changed, 12 insertions, 8 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index edf8677557..f6cbca9648 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -198,11 +198,12 @@ private[sql] object JsonRDD extends Logging { * in this JSON object can appear in other JSON objects. */ private def allKeysWithValueTypes(m: Map[String, Any]): Set[(String, DataType)] = { - m.map{ + val keyValuePairs = m.map { // Quote the key with backticks to handle cases which have dots // in the field name. - case (key, dataType) => (s"`$key`", dataType) - }.flatMap { + case (key, value) => (s"`$key`", value) + }.toSet + keyValuePairs.flatMap { case (key: String, struct: Map[String, Any]) => { // The value associted with the key is an JSON object. allKeysWithValueTypes(struct).map { @@ -224,7 +225,7 @@ private[sql] object JsonRDD extends Logging { } } case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil - }.toSet + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 10bd9f08f0..e765cfc83a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -451,7 +451,9 @@ class JsonSuite extends QueryTest { val jsonSchemaRDD = jsonRDD(arrayElementTypeConflict) val expectedSchema = - AttributeReference("array", ArrayType(StringType), true)() :: Nil + AttributeReference("array1", ArrayType(StringType), true)() :: + AttributeReference("array2", ArrayType(StructType( + StructField("field", LongType, true) :: Nil)), true)() :: Nil comparePlans(Schema(expectedSchema), Schema(jsonSchemaRDD.logicalPlan.output)) @@ -460,12 +462,12 @@ class JsonSuite extends QueryTest { checkAnswer( sql("select * from jsonTable"), Seq(Seq("1", 
"1.1", "true", null, "[]", "{}", "[2,3,4]", - """{"field":str}""")) :: Nil + """{"field":str}"""), Seq(Seq(214748364700L), Seq(1))) :: Nil ) // Treat an element as a number. checkAnswer( - sql("select array[0] + 1 from jsonTable"), + sql("select array1[0] + 1 from jsonTable"), 2 ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala index 065e04046e..d0180f3754 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala @@ -72,7 +72,8 @@ object TestJsonData { val arrayElementTypeConflict = TestSQLContext.sparkContext.parallelize( - """{"array": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}]}""" :: Nil) + """{"array1": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}], + "array2": [{"field":214748364700}, {"field":1}]}""" :: Nil) val missingFields = TestSQLContext.sparkContext.parallelize( |