From 8f90c151878571e20625e2a53561441ec0035dfc Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 20 Jan 2016 14:59:30 -0800 Subject: [SPARK-12616][SQL] Making Logical Operator `Union` Support Arbitrary Number of Children The existing `Union` logical operator only supports two children. Thus, adding a new logical operator `Unions` which can have arbitrary number of children to replace the existing one. `Union` logical plan is a binary node. However, a typical use case for union is to union a very large number of input sources (DataFrames, RDDs, or files). It is not uncommon to union hundreds of thousands of files. In this case, our optimizer can become very slow due to the large number of logical unions. We should change the Union logical plan to support an arbitrary number of children, and add a single rule in the optimizer to collapse all adjacent `Unions` into a single `Unions`. Note that this problem doesn't exist in physical plan, because the physical `Unions` already supports arbitrary number of children. Author: gatorsmile Author: xiaoli Author: Xiao Li Closes #10577 from gatorsmile/unionAllMultiChildren. --- .../main/scala/org/apache/spark/sql/hive/SQLBuilder.scala | 12 +++++++----- .../org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala | 4 ++++ 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'sql/hive') diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala index e83b4bffff..1654594538 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/SQLBuilder.scala @@ -129,11 +129,13 @@ class SQLBuilder(logicalPlan: LogicalPlan, sqlContext: SQLContext) extends Loggi conditionSQL = condition.sql } yield s"$childSQL $whereOrHaving $conditionSQL" - case Union(left, right) => - for { - leftSQL <- toSQL(left) - rightSQL <- toSQL(right) - } yield s"$leftSQL UNION ALL $rightSQL" + case Union(children) if children.length > 1 => + val childrenSql = children.map(toSQL(_)) + if (childrenSql.exists(_.isEmpty)) { + None + } else { + Some(childrenSql.map(_.get).mkString(" UNION ALL ")) + } // Persisted data source relation case Subquery(alias, LogicalRelation(_, _, Some(TableIdentifier(table, Some(database))))) => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala index 0604d9f47c..261a4746f4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/LogicalPlanToSQLSuite.scala @@ -105,6 +105,10 @@ class LogicalPlanToSQLSuite extends SQLBuilderTest with SQLTestUtils { checkHiveQl("SELECT id FROM t0 UNION ALL SELECT CAST(id AS INT) AS id FROM t0") } + test("three-child union") { + checkHiveQl("SELECT id FROM t0 UNION ALL SELECT id FROM t0 UNION ALL SELECT id FROM t0") + } + test("case") { checkHiveQl("SELECT CASE WHEN id % 2 > 0 THEN 0 WHEN id % 2 = 0 THEN 1 END FROM t0") } -- cgit v1.2.3