aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala4
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala19
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala7
3 files changed, 29 insertions, 1 deletion
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index ffb9548356..00449c2007 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -263,6 +263,8 @@ private[sql] object JsonRDD extends Logging {
val elementType = typeOfArray(array)
buildKeyPathForInnerStructs(array, elementType) :+ (key, elementType)
}
+ // we couldn't tell what the type is if the value is null or empty string
+ case (key: String, value) if value == "" || value == null => (key, NullType) :: Nil
case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil
}
}
@@ -400,13 +402,13 @@ private[sql] object JsonRDD extends Logging {
} else {
desiredType match {
case StringType => toString(value)
+ case _ if value == null || value == "" => null // guard the non string type
case IntegerType => value.asInstanceOf[IntegerType.JvmType]
case LongType => toLong(value)
case DoubleType => toDouble(value)
case DecimalType() => toDecimal(value)
case BooleanType => value.asInstanceOf[BooleanType.JvmType]
case NullType => null
-
case ArrayType(elementType, _) =>
value.asInstanceOf[Seq[Any]].map(enforceCorrectType(_, elementType))
case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index f088d41325..8dce3372a8 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -193,6 +193,25 @@ class JsonSuite extends QueryTest {
StringType)
}
+ test("Complex field and type inferring with null in sampling") {
+ val jsonSchemaRDD = jsonRDD(jsonNullStruct)
+ val expectedSchema = StructType(
+ StructField("headers", StructType(
+ StructField("Charset", StringType, true) ::
+ StructField("Host", StringType, true) :: Nil)
+ , true) ::
+ StructField("ip", StringType, true) ::
+ StructField("nullstr", StringType, true):: Nil)
+
+ assert(expectedSchema === jsonSchemaRDD.schema)
+ jsonSchemaRDD.registerTempTable("jsonTable")
+
+ checkAnswer(
+ sql("select nullstr, headers.Host from jsonTable"),
+ Seq(Row("", "1.abc.com"), Row("", null), Row("", null), Row(null, null))
+ )
+ }
+
test("Primitive field and type inferring") {
val jsonSchemaRDD = jsonRDD(primitiveFieldAndType)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
index e5773a5587..3370b3c98b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
@@ -43,6 +43,13 @@ object TestJsonData {
"""{"num_num_1":21474836570, "num_num_2":1.1, "num_num_3": 21474836470,
"num_bool":null, "num_str":92233720368547758070, "str_bool":null}""" :: Nil)
+ val jsonNullStruct =
+ TestSQLContext.sparkContext.parallelize(
+ """{"nullstr":"","ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" ::
+ """{"nullstr":"","ip":"27.31.100.29","headers":{}}""" ::
+ """{"nullstr":"","ip":"27.31.100.29","headers":""}""" ::
+ """{"nullstr":null,"ip":"27.31.100.29","headers":null}""" :: Nil)
+
val complexFieldValueTypeConflict =
TestSQLContext.sparkContext.parallelize(
"""{"num_struct":11, "str_array":[1, 2, 3],