aboutsummaryrefslogtreecommitdiff
path: root/sql/hive
diff options
context:
space:
mode:
authorgatorsmile <gatorsmile@gmail.com>2016-01-20 14:59:30 -0800
committerReynold Xin <rxin@databricks.com>2016-01-20 14:59:30 -0800
commit8f90c151878571e20625e2a53561441ec0035dfc (patch)
treeb9b4354468e5e2f220c14ac520a960c94e0274b5 /sql/hive
parentb7d74a602f622d8e105b349bd6d17ba42e7668dc (diff)
downloadspark-8f90c151878571e20625e2a53561441ec0035dfc.tar.gz
spark-8f90c151878571e20625e2a53561441ec0035dfc.tar.bz2
spark-8f90c151878571e20625e2a53561441ec0035dfc.zip
[SPARK-12616][SQL] Making Logical Operator `Union` Support Arbitrary Number of Children
The existing `Union` logical operator only supports two children. Thus, adding a new logical operator `Unions` which can have arbitrary number of children to replace the existing one. `Union` logical plan is a binary node. However, a typical use case for union is to union a very large number of input sources (DataFrames, RDDs, or files). It is not uncommon to union hundreds of thousands of files. In this case, our optimizer can become very slow due to the large number of logical unions. We should change the Union logical plan to support an arbitrary number of children, and add a single rule in the optimizer to collapse all adjacent `Unions` into a single `Unions`. Note that this problem doesn't exist in physical plan, because the physical `Unions` already supports arbitrary number of children. Author: gatorsmile <gatorsmile@gmail.com> Author: xiaoli <lixiao1983@gmail.com> Author: Xiao Li <xiaoli@Xiaos-MacBook-Pro.local> Closes #10577 from gatorsmile/unionAllMultiChildren.
Diffstat (limited to 'sql/hive')
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala12
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala4
2 files changed, 11 insertions, 5 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala
index e83b4bffff..1654594538 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala
@@ -129,11 +129,13 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi
conditionSQL = condition.sql
} yield s"$childSQL $whereOrHaving $conditionSQL"
- case Union(left, right) =>
- for {
- leftSQL <- toSQL(left)
- rightSQL <- toSQL(right)
- } yield s"$leftSQL UNION ALL $rightSQL"
+ case Union(children) if children.length > 1 =>
+ val childrenSql = children.map(toSQL(_))
+ if (childrenSql.exists(_.isEmpty)) {
+ None
+ } else {
+ Some(childrenSql.map(_.get).mkString(" UNION ALL "))
+ }
// Persisted data source relation
case Subquery(alias, LogicalRelation(_, _, Some(TableIdentifier(table, Some(database))))) =>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala
index 0604d9f47c..261a4746f4 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala
@@ -105,6 +105,10 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils {
checkHiveQl("SELECT id FROM t0 UNION ALL SELECT CAST(id AS INT) AS id FROM t0")
}
+ test("three-child union") {
+ checkHiveQl("SELECT id FROM t0 UNION ALL SELECT id FROM t0 UNION ALL SELECT id FROM t0")
+ }
+
test("case") {
checkHiveQl("SELECT CASE WHEN id % 2 > 0 THEN 0 WHEN id % 2 = 0 THEN 1 END FROM t0")
}