aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
author gatorsmile <gatorsmile@gmail.com> 2016-02-11 08:40:27 +0100
committer Herman van Hovell <hvanhovell@questtec.nl> 2016-02-11 08:40:27 +0100
commit e88bff12795a6134e2e7204996b603e948380e18 (patch)
tree d4c5b19801ebfce2e08b5b4bfee08b679f3f3f6a /sql
parent 1842c55d89ae99a610a955ce61633a9084e000f2 (diff)
download spark-e88bff12795a6134e2e7204996b603e948380e18.tar.gz
spark-e88bff12795a6134e2e7204996b603e948380e18.tar.bz2
spark-e88bff12795a6134e2e7204996b603e948380e18.zip
[SPARK-13235][SQL] Removed an Extra Distinct from the Plan when Using Union in SQL
Currently, the parser adds two `Distinct` operators to the plan when a SQL query uses `Union` or `Union Distinct`. This PR removes the extra `Distinct` from the plan.

For example, before the fix, the following query has a plan with two `Distinct` operators:

```scala
sql("select * from t0 union select * from t0").explain(true)
```

```
== Parsed Logical Plan ==
'Project [unresolvedalias(*,None)]
+- 'Subquery u_2
   +- 'Distinct
      +- 'Project [unresolvedalias(*,None)]
         +- 'Subquery u_1
            +- 'Distinct
               +- 'Union
                  :- 'Project [unresolvedalias(*,None)]
                  :  +- 'UnresolvedRelation `t0`, None
                  +- 'Project [unresolvedalias(*,None)]
                     +- 'UnresolvedRelation `t0`, None

== Analyzed Logical Plan ==
id: bigint
Project [id#16L]
+- Subquery u_2
   +- Distinct
      +- Project [id#16L]
         +- Subquery u_1
            +- Distinct
               +- Union
                  :- Project [id#16L]
                  :  +- Subquery t0
                  :     +- Relation[id#16L] ParquetRelation
                  +- Project [id#16L]
                     +- Subquery t0
                        +- Relation[id#16L] ParquetRelation

== Optimized Logical Plan ==
Aggregate [id#16L], [id#16L]
+- Aggregate [id#16L], [id#16L]
   +- Union
      :- Project [id#16L]
      :  +- Relation[id#16L] ParquetRelation
      +- Project [id#16L]
         +- Relation[id#16L] ParquetRelation
```

After the fix, the plan no longer contains the extra `Distinct`:

```
== Parsed Logical Plan ==
'Project [unresolvedalias(*,None)]
+- 'Subquery u_1
   +- 'Distinct
      +- 'Union
         :- 'Project [unresolvedalias(*,None)]
         :  +- 'UnresolvedRelation `t0`, None
         +- 'Project [unresolvedalias(*,None)]
            +- 'UnresolvedRelation `t0`, None

== Analyzed Logical Plan ==
id: bigint
Project [id#17L]
+- Subquery u_1
   +- Distinct
      +- Union
         :- Project [id#16L]
         :  +- Subquery t0
         :     +- Relation[id#16L] ParquetRelation
         +- Project [id#16L]
            +- Subquery t0
               +- Relation[id#16L] ParquetRelation

== Optimized Logical Plan ==
Aggregate [id#17L], [id#17L]
+- Union
   :- Project [id#16L]
   :  +- Relation[id#16L] ParquetRelation
   +- Project [id#16L]
      +- Relation[id#16L] ParquetRelation
```

Author: gatorsmile <gatorsmile@gmail.com>

Closes #11120 from gatorsmile/unionDistinct.
Diffstat (limited to 'sql')
-rw-r--r-- sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g | 28
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala | 33
2 files changed, 32 insertions, 29 deletions
diff --git a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
index 9f2a5eb35c..24483ccb5d 100644
--- a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
+++ b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
@@ -2370,34 +2370,8 @@ setOpSelectStatement[CommonTree t, boolean topLevel]
u=setOperator LPAREN b=simpleSelectStatement RPAREN
|
u=setOperator b=simpleSelectStatement)
- -> {$setOpSelectStatement.tree != null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}?
- ^(TOK_QUERY
- ^(TOK_FROM
- ^(TOK_SUBQUERY
- ^($u {$setOpSelectStatement.tree} $b)
- {adaptor.create(Identifier, generateUnionAlias())}
- )
- )
- ^(TOK_INSERT
- ^(TOK_DESTINATION ^(TOK_DIR TOK_TMP_FILE))
- ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF))
- )
- )
- -> {$setOpSelectStatement.tree != null && $u.tree.getType()!=SparkSqlParser.TOK_UNIONDISTINCT}?
+ -> {$setOpSelectStatement.tree != null}?
^($u {$setOpSelectStatement.tree} $b)
- -> {$setOpSelectStatement.tree == null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}?
- ^(TOK_QUERY
- ^(TOK_FROM
- ^(TOK_SUBQUERY
- ^($u {$t} $b)
- {adaptor.create(Identifier, generateUnionAlias())}
- )
- )
- ^(TOK_INSERT
- ^(TOK_DESTINATION ^(TOK_DIR TOK_TMP_FILE))
- ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF))
- )
- )
-> ^($u {$t} $b)
)+
o=orderByClause?
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
index 682b77dc65..8d7d6b5bf5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
@@ -18,10 +18,10 @@
package org.apache.spark.sql.catalyst
import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.analysis.{UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction}
+import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
-import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project}
+import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.unsafe.types.CalendarInterval
class CatalystQlSuite extends PlanTest {
@@ -45,6 +45,35 @@ class CatalystQlSuite extends PlanTest {
comparePlans(parsed, expected)
}
+ test("test Union Distinct operator") {
+ val parsed1 = parser.parsePlan("SELECT * FROM t0 UNION SELECT * FROM t1")
+ val parsed2 = parser.parsePlan("SELECT * FROM t0 UNION DISTINCT SELECT * FROM t1")
+ val expected =
+ Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+ Subquery("u_1",
+ Distinct(
+ Union(
+ Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+ UnresolvedRelation(TableIdentifier("t0"), None)),
+ Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+ UnresolvedRelation(TableIdentifier("t1"), None))))))
+ comparePlans(parsed1, expected)
+ comparePlans(parsed2, expected)
+ }
+
+ test("test Union All operator") {
+ val parsed = parser.parsePlan("SELECT * FROM t0 UNION ALL SELECT * FROM t1")
+ val expected =
+ Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+ Subquery("u_1",
+ Union(
+ Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+ UnresolvedRelation(TableIdentifier("t0"), None)),
+ Project(UnresolvedAlias(UnresolvedStar(None)) :: Nil,
+ UnresolvedRelation(TableIdentifier("t1"), None)))))
+ comparePlans(parsed, expected)
+ }
+
test("support hive interval literal") {
def checkInterval(sql: String, result: CalendarInterval): Unit = {
val parsed = parser.parsePlan(sql)