diff options
author | gatorsmile <gatorsmile@gmail.com> | 2016-02-11 08:40:27 +0100 |
---|---|---|
committer | Herman van Hovell <hvanhovell@questtec.nl> | 2016-02-11 08:40:27 +0100 |
commit | e88bff12795a6134e2e7204996b603e948380e18 (patch) | |
tree | d4c5b19801ebfce2e08b5b4bfee08b679f3f3f6a /sql | |
parent | 1842c55d89ae99a610a955ce61633a9084e000f2 (diff) | |
download | spark-e88bff12795a6134e2e7204996b603e948380e18.tar.gz spark-e88bff12795a6134e2e7204996b603e948380e18.tar.bz2 spark-e88bff12795a6134e2e7204996b603e948380e18.zip |
[SPARK-13235][SQL] Removed an Extra Distinct from the Plan when Using Union in SQL
Currently, the parser adds two `Distinct` operators to the plan when `UNION` or `UNION DISTINCT` is used in a SQL query. This PR removes the extra `Distinct` from the plan.
For example, before the fix, the following query produces a plan with two `Distinct` operators:
```scala
sql("select * from t0 union select * from t0").explain(true)
```
```
== Parsed Logical Plan ==
'Project [unresolvedalias(*,None)]
+- 'Subquery u_2
   +- 'Distinct
      +- 'Project [unresolvedalias(*,None)]
         +- 'Subquery u_1
            +- 'Distinct
               +- 'Union
                  :- 'Project [unresolvedalias(*,None)]
                  :  +- 'UnresolvedRelation `t0`, None
                  +- 'Project [unresolvedalias(*,None)]
                     +- 'UnresolvedRelation `t0`, None
== Analyzed Logical Plan ==
id: bigint
Project [id#16L]
+- Subquery u_2
   +- Distinct
      +- Project [id#16L]
         +- Subquery u_1
            +- Distinct
               +- Union
                  :- Project [id#16L]
                  :  +- Subquery t0
                  :     +- Relation[id#16L] ParquetRelation
                  +- Project [id#16L]
                     +- Subquery t0
                        +- Relation[id#16L] ParquetRelation
== Optimized Logical Plan ==
Aggregate [id#16L], [id#16L]
+- Aggregate [id#16L], [id#16L]
   +- Union
      :- Project [id#16L]
      :  +- Relation[id#16L] ParquetRelation
      +- Project [id#16L]
         +- Relation[id#16L] ParquetRelation
```
After the fix, the plan no longer contains the extra `Distinct`:
```
== Parsed Logical Plan ==
'Project [unresolvedalias(*,None)]
+- 'Subquery u_1
   +- 'Distinct
      +- 'Union
         :- 'Project [unresolvedalias(*,None)]
         :  +- 'UnresolvedRelation `t0`, None
         +- 'Project [unresolvedalias(*,None)]
            +- 'UnresolvedRelation `t0`, None
== Analyzed Logical Plan ==
id: bigint
Project [id#17L]
+- Subquery u_1
   +- Distinct
      +- Union
         :- Project [id#16L]
         :  +- Subquery t0
         :     +- Relation[id#16L] ParquetRelation
         +- Project [id#16L]
            +- Subquery t0
               +- Relation[id#16L] ParquetRelation
== Optimized Logical Plan ==
Aggregate [id#17L], [id#17L]
+- Union
   :- Project [id#16L]
   :  +- Relation[id#16L] ParquetRelation
   +- Project [id#16L]
      +- Relation[id#16L] ParquetRelation
```
Author: gatorsmile <gatorsmile@gmail.com>
Closes #11120 from gatorsmile/unionDistinct.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g | 28 | ||||
-rw-r--r-- | sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala | 33 |
2 files changed, 32 insertions, 29 deletions
diff --git a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g index 9f2a5eb35c..24483ccb5d 100644 --- a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g +++ b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g @@ -2370,34 +2370,8 @@ setOpSelectStatement[CommonTree t, boolean topLevel] u=setOperator LPAREN b=simpleSelectStatement RPAREN | u=setOperator b=simpleSelectStatement) - -> {$setOpSelectStatement.tree != null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}? - ^(TOK_QUERY - ^(TOK_FROM - ^(TOK_SUBQUERY - ^($u {$setOpSelectStatement.tree} $b) - {adaptor.create(Identifier, generateUnionAlias())} - ) - ) - ^(TOK_INSERT - ^(TOK_DESTINATION ^(TOK_DIR TOK_TMP_FILE)) - ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF)) - ) - ) - -> {$setOpSelectStatement.tree != null && $u.tree.getType()!=SparkSqlParser.TOK_UNIONDISTINCT}? + -> {$setOpSelectStatement.tree != null}? ^($u {$setOpSelectStatement.tree} $b) - -> {$setOpSelectStatement.tree == null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}? - ^(TOK_QUERY - ^(TOK_FROM - ^(TOK_SUBQUERY - ^($u {$t} $b) - {adaptor.create(Identifier, generateUnionAlias())} - ) - ) - ^(TOK_INSERT - ^(TOK_DESTINATION ^(TOK_DIR TOK_TMP_FILE)) - ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF)) - ) - ) -> ^($u {$t} $b) )+ o=orderByClause? 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala index 682b77dc65..8d7d6b5bf5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala @@ -18,10 +18,10 @@ package org.apache.spark.sql.catalyst import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.analysis.{UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction} +import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.unsafe.types.CalendarInterval class CatalystQlSuite extends PlanTest { @@ -45,6 +45,35 @@ class CatalystQlSuite extends PlanTest { comparePlans(parsed, expected) } + test("test Union Distinct operator") { + val parsed1 = parser.parsePlan("SELECT * FROM t0 UNION SELECT * FROM t1") + val parsed2 = parser.parsePlan("SELECT * FROM t0 UNION DISTINCT SELECT * FROM t1") + val expected = + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + Subquery("u_1", + Distinct( + Union( + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + UnresolvedRelation(TableIdentifier("t0"), None)), + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + UnresolvedRelation(TableIdentifier("t1"), None)))))) + comparePlans(parsed1, expected) + comparePlans(parsed2, expected) + } + + test("test Union All operator") { + val parsed = parser.parsePlan("SELECT * FROM t0 UNION ALL SELECT * FROM t1") + val expected = + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + Subquery("u_1", + Union( + Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + UnresolvedRelation(TableIdentifier("t0"), None)), + 
Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil, + UnresolvedRelation(TableIdentifier("t1"), None))))) + comparePlans(parsed, expected) + } + test("support hive interval literal") { def checkInterval(sql: String, result: CalendarInterval): Unit = { val parsed = parser.parsePlan(sql) |