diff options
author | Yin Huai <huaiyin.thu@gmail.com> | 2014-07-07 17:05:59 -0700 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2014-07-07 17:05:59 -0700 |
commit | f0496ee10847db921a028a34f70385f9b740b3f3 (patch) | |
tree | e7ce39f5c6c6c7ad11fe7a904ee488682430e387 /sql | |
parent | 4deeed17c4847f212a4fa1a8685cfe8a12179263 (diff) | |
download | spark-f0496ee10847db921a028a34f70385f9b740b3f3.tar.gz spark-f0496ee10847db921a028a34f70385f9b740b3f3.tar.bz2 spark-f0496ee10847db921a028a34f70385f9b740b3f3.zip |
[SPARK-2375][SQL] JSON schema inference may not resolve type conflicts correctly for a field inside an array of structs
For example, for
```
{"array": [{"field":214748364700}, {"field":1}]}
```
the type of field is resolved as IntType, while for
```
{"array": [{"field":1}, {"field":214748364700}]}
```
the type of field is resolved as LongType.
JIRA: https://issues.apache.org/jira/browse/SPARK-2375
Author: Yin Huai <huaiyin.thu@gmail.com>
Closes #1308 from yhuai/SPARK-2375 and squashes the following commits:
3e2e312 [Yin Huai] Update unit test.
1b2ff9f [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2375
10794eb [Yin Huai] Correctly resolve the type of a field inside an array of structs.
Diffstat (limited to 'sql')
3 files changed, 12 insertions, 8 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index edf8677557..f6cbca9648 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -198,11 +198,12 @@ private[sql] object JsonRDD extends Logging { * in this JSON object can appear in other JSON objects. */ private def allKeysWithValueTypes(m: Map[String, Any]): Set[(String, DataType)] = { - m.map{ + val keyValuePairs = m.map { // Quote the key with backticks to handle cases which have dots // in the field name. - case (key, dataType) => (s"`$key`", dataType) - }.flatMap { + case (key, value) => (s"`$key`", value) + }.toSet + keyValuePairs.flatMap { case (key: String, struct: Map[String, Any]) => { // The value associted with the key is an JSON object. allKeysWithValueTypes(struct).map { @@ -224,7 +225,7 @@ private[sql] object JsonRDD extends Logging { } } case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil - }.toSet + } } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 10bd9f08f0..e765cfc83a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -451,7 +451,9 @@ class JsonSuite extends QueryTest { val jsonSchemaRDD = jsonRDD(arrayElementTypeConflict) val expectedSchema = - AttributeReference("array", ArrayType(StringType), true)() :: Nil + AttributeReference("array1", ArrayType(StringType), true)() :: + AttributeReference("array2", ArrayType(StructType( + StructField("field", LongType, true) :: Nil)), true)() :: Nil comparePlans(Schema(expectedSchema), Schema(jsonSchemaRDD.logicalPlan.output)) @@ -460,12 +462,12 @@ class JsonSuite extends QueryTest { checkAnswer( sql("select * from jsonTable"), Seq(Seq("1", 
"1.1", "true", null, "[]", "{}", "[2,3,4]", - """{"field":str}""")) :: Nil + """{"field":str}"""), Seq(Seq(214748364700L), Seq(1))) :: Nil ) // Treat an element as a number. checkAnswer( - sql("select array[0] + 1 from jsonTable"), + sql("select array1[0] + 1 from jsonTable"), 2 ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala index 065e04046e..d0180f3754 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala @@ -72,7 +72,8 @@ object TestJsonData { val arrayElementTypeConflict = TestSQLContext.sparkContext.parallelize( - """{"array": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}]}""" :: Nil) + """{"array1": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}], + "array2": [{"field":214748364700}, {"field":1}]}""" :: Nil) val missingFields = TestSQLContext.sparkContext.parallelize( |