diff options
author | Liang-Chi Hsieh <viirya@gmail.com> | 2017-02-04 15:57:56 -0800 |
---|---|---|
committer | gatorsmile <gatorsmile@gmail.com> | 2017-02-04 15:57:56 -0800 |
commit | 0674e7eb85160e3f8da333b5243d76063824d58c (patch) | |
tree | 6ce795f0228a96b87364009644e25c759064e0db /sql/catalyst | |
parent | 2f3c20bbddd266015d9478c35ce2b37d67e01200 (diff) | |
download | spark-0674e7eb85160e3f8da333b5243d76063824d58c.tar.gz spark-0674e7eb85160e3f8da333b5243d76063824d58c.tar.bz2 spark-0674e7eb85160e3f8da333b5243d76063824d58c.zip |
[SPARK-19425][SQL] Make ExtractEquiJoinKeys support UDT columns
## What changes were proposed in this pull request?
DataFrame.except doesn't work for UDT columns. It is because `ExtractEquiJoinKeys` will run `Literal.default` against UDT. However, we don't handle UDT in `Literal.default` and an exception will throw like:
java.lang.RuntimeException: no default for type
org.apache.spark.ml.linalg.VectorUDT3bfc3ba7
at org.apache.spark.sql.catalyst.expressions.Literal$.default(literals.scala:179)
at org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys$$anonfun$4.apply(patterns.scala:117)
at org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys$$anonfun$4.apply(patterns.scala:110)
More simple fix is just let `Literal.default` handle UDT by its sql type. So we can use more efficient join type on UDT.
Besides `except`, this also fixes other similar scenarios, so in summary this fixes:
* `except` on two Datasets with UDT
* `intersect` on two Datasets with UDT
* `Join` with the join conditions using `<=>` on UDT columns
## How was this patch tested?
Jenkins tests.
Please review http://spark.apache.org/contributing.html before opening a pull request.
Author: Liang-Chi Hsieh <viirya@gmail.com>
Closes #16765 from viirya/df-except-for-udt.
Diffstat (limited to 'sql/catalyst')
2 files changed, 4 insertions, 0 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index cb0c4d333b..e66fb89339 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -175,6 +175,7 @@ object Literal { case map: MapType => create(Map(), map) case struct: StructType => create(InternalRow.fromSeq(struct.fields.map(f => default(f.dataType).value)), struct) + case udt: UserDefinedType[_] => default(udt.sqlType) case other => throw new RuntimeException(s"no default for type $dataType") } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index 4af4da8a9f..15e8e6c057 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.CatalystTypeConverters +import org.apache.spark.sql.catalyst.encoders.ExamplePointUDT import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval @@ -67,6 +68,8 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Literal.default(ArrayType(StringType)), Array()) checkEvaluation(Literal.default(MapType(IntegerType, StringType)), Map()) checkEvaluation(Literal.default(StructType(StructField("a", StringType) :: Nil)), Row("")) + // ExamplePointUDT.sqlType is ArrayType(DoubleType, false). + checkEvaluation(Literal.default(new ExamplePointUDT), Array()) } test("boolean literals") { |