From 8d0d2a65eb3a7b1865f7fa7cc18b146fc6474620 Mon Sep 17 00:00:00 2001
From: Cheng Hao
Date: Wed, 17 Dec 2014 15:01:59 -0800
Subject: [SPARK-4856] [SQL] NullType instead of StringType when sampling
 against empty string or null value

```
TestSQLContext.sparkContext.parallelize(
  """{"ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" ::
  """{"ip":"27.31.100.29","headers":{}}""" ::
  """{"ip":"27.31.100.29","headers":""}""" :: Nil)
```

Because the empty string value of "headers" in lines 2 and 3 is initially inferred as StringType during sampling, schema inference ignores the real nested data type (the struct type of "headers" in line 1) and also resolves the "headers" field of line 1 to StringType, which is not what we expect.

Author: Cheng Hao

Closes #3708 from chenghao-intel/json and squashes the following commits:

e7a72e9 [Cheng Hao] add more concise unit test
853de51 [Cheng Hao] NullType instead of StringType when sampling against empty string or null value
---
 .../scala/org/apache/spark/sql/json/JsonRDD.scala   |  4 +++-
 .../scala/org/apache/spark/sql/json/JsonSuite.scala | 19 +++++++++++++++++++
 .../org/apache/spark/sql/json/TestJsonData.scala    |  7 +++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index ffb9548356..00449c2007 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -263,6 +263,8 @@ private[sql] object JsonRDD extends Logging {
         val elementType = typeOfArray(array)
         buildKeyPathForInnerStructs(array, elementType) :+ (key, elementType)
       }
+      // we couldn't tell what the type is if the value is null or empty string
+      case (key: String, value) if value == "" || value == null => (key, NullType) :: Nil
       case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil
     }
   }
@@ -400,13 +402,13 @@ private[sql] object JsonRDD extends Logging {
     } else {
       desiredType match {
         case StringType => toString(value)
+        case _ if value == null || value == "" => null // guard the non string type
         case IntegerType => value.asInstanceOf[IntegerType.JvmType]
         case LongType => toLong(value)
         case DoubleType => toDouble(value)
         case DecimalType() => toDecimal(value)
         case BooleanType => value.asInstanceOf[BooleanType.JvmType]
         case NullType => null
-
         case ArrayType(elementType, _) =>
           value.asInstanceOf[Seq[Any]].map(enforceCorrectType(_, elementType))
         case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index f088d41325..8dce3372a8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -193,6 +193,25 @@ class JsonSuite extends QueryTest {
       StringType)
   }

+  test("Complex field and type inferring with null in sampling") {
+    val jsonSchemaRDD = jsonRDD(jsonNullStruct)
+    val expectedSchema = StructType(
+      StructField("headers", StructType(
+        StructField("Charset", StringType, true) ::
+          StructField("Host", StringType, true) :: Nil)
+        , true) ::
+      StructField("ip", StringType, true) ::
+      StructField("nullstr", StringType, true):: Nil)
+
+    assert(expectedSchema === jsonSchemaRDD.schema)
+    jsonSchemaRDD.registerTempTable("jsonTable")
+
+    checkAnswer(
+      sql("select nullstr, headers.Host from jsonTable"),
null), Row("", null), Row(null, null)) + ) + } + test("Primitive field and type inferring") { val jsonSchemaRDD = jsonRDD(primitiveFieldAndType) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala index e5773a5587..3370b3c98b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala @@ -43,6 +43,13 @@ object TestJsonData { """{"num_num_1":21474836570, "num_num_2":1.1, "num_num_3": 21474836470, "num_bool":null, "num_str":92233720368547758070, "str_bool":null}""" :: Nil) + val jsonNullStruct = + TestSQLContext.sparkContext.parallelize( + """{"nullstr":"","ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" :: + """{"nullstr":"","ip":"27.31.100.29","headers":{}}""" :: + """{"nullstr":"","ip":"27.31.100.29","headers":""}""" :: + """{"nullstr":null,"ip":"27.31.100.29","headers":null}""" :: Nil) + val complexFieldValueTypeConflict = TestSQLContext.sparkContext.parallelize( """{"num_struct":11, "str_array":[1, 2, 3], -- cgit v1.2.3