aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst
diff options
context:
space:
mode:
authorTakeshi Yamamuro <yamamuro@apache.org>2017-03-21 11:17:34 +0800
committerWenchen Fan <wenchen@databricks.com>2017-03-21 11:17:34 +0800
commit0ec1db5475f1a7839bdbf0d9cffe93ce6970a7fe (patch)
tree3a1c5d945583f7189018904ec033e469ae281b2e /sql/catalyst
parente9c91badce64731ffd3e53cbcd9f044a7593e6b8 (diff)
downloadspark-0ec1db5475f1a7839bdbf0d9cffe93ce6970a7fe.tar.gz
spark-0ec1db5475f1a7839bdbf0d9cffe93ce6970a7fe.tar.bz2
spark-0ec1db5475f1a7839bdbf0d9cffe93ce6970a7fe.zip
[SPARK-19980][SQL] Add NULL checks in Bean serializer
## What changes were proposed in this pull request? A Bean serializer in `ExpressionEncoder` could change values when Beans having NULL. A concrete example is as follows; ``` scala> :paste class Outer extends Serializable { private var cls: Inner = _ def setCls(c: Inner): Unit = cls = c def getCls(): Inner = cls } class Inner extends Serializable { private var str: String = _ def setStr(s: String): Unit = str = str def getStr(): String = str } scala> Seq("""{"cls":null}""", """{"cls": {"str":null}}""").toDF().write.text("data") scala> val encoder = Encoders.bean(classOf[Outer]) scala> val schema = encoder.schema scala> val df = spark.read.schema(schema).json("data").as[Outer](encoder) scala> df.show +------+ | cls| +------+ |[null]| | null| +------+ scala> df.map(x => x)(encoder).show() +------+ | cls| +------+ |[null]| |[null]| // <-- Value changed +------+ ``` This is because the Bean serializer does not have the NULL-check expressions that the serializer of Scala's product types has. Actually, this value change does not happen in Scala's product types; ``` scala> :paste case class Outer(cls: Inner) case class Inner(str: String) scala> val encoder = Encoders.product[Outer] scala> val schema = encoder.schema scala> val df = spark.read.schema(schema).json("data").as[Outer](encoder) scala> df.show +------+ | cls| +------+ |[null]| | null| +------+ scala> df.map(x => x)(encoder).show() +------+ | cls| +------+ |[null]| | null| +------+ ``` This pr added the NULL-check expressions in Bean serializer along with the serializer of Scala's product types. ## How was this patch tested? Added tests in `JavaDatasetSuite`. Author: Takeshi Yamamuro <yamamuro@apache.org> Closes #17347 from maropu/SPARK-19980.
Diffstat (limited to 'sql/catalyst')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala11
1 files changed, 9 insertions, 2 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
index 4ff87edde1..9d4617dda5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/JavaTypeInference.scala
@@ -343,7 +343,11 @@ object JavaTypeInference {
*/
def serializerFor(beanClass: Class[_]): CreateNamedStruct = {
val inputObject = BoundReference(0, ObjectType(beanClass), nullable = true)
- serializerFor(inputObject, TypeToken.of(beanClass)).asInstanceOf[CreateNamedStruct]
+ val nullSafeInput = AssertNotNull(inputObject, Seq("top level input bean"))
+ serializerFor(nullSafeInput, TypeToken.of(beanClass)) match {
+ case expressions.If(_, _, s: CreateNamedStruct) => s
+ case other => CreateNamedStruct(expressions.Literal("value") :: other :: Nil)
+ }
}
private def serializerFor(inputObject: Expression, typeToken: TypeToken[_]): Expression = {
@@ -427,7 +431,7 @@ object JavaTypeInference {
case other =>
val properties = getJavaBeanReadableAndWritableProperties(other)
- CreateNamedStruct(properties.flatMap { p =>
+ val nonNullOutput = CreateNamedStruct(properties.flatMap { p =>
val fieldName = p.getName
val fieldType = typeToken.method(p.getReadMethod).getReturnType
val fieldValue = Invoke(
@@ -436,6 +440,9 @@ object JavaTypeInference {
inferExternalType(fieldType.getRawType))
expressions.Literal(fieldName) :: serializerFor(fieldValue, fieldType) :: Nil
})
+
+ val nullOutput = expressions.Literal.create(null, nonNullOutput.dataType)
+ expressions.If(IsNull(inputObject), nullOutput, nonNullOutput)
}
}
}