[SPARK-12542][SQL] support except/intersect in HiveQl

Parse the SQL query with except/intersect in FROM clause for HivQL. Author: Davies Liu <davies@databricks.com> Closes #10622 from davies/intersect.
author: Davies Liu <davies@databricks.com> 2016-01-06 23:46:12 -0800
committer: Reynold Xin <rxin@databricks.com> 2016-01-06 23:46:12 -0800
commit: fd1dcfaf2608c2cc3a439ed3ca044ae655982306 (patch)
tree: 01c77444dc5ac7cccc80447f7103dd1801e8a361 /sql
parent: 6a1c864ab6ee3e869a16ffdbaf6fead21c7aac6d (diff)
download: spark-fd1dcfaf2608c2cc3a439ed3ca044ae655982306.tar.gz
spark-fd1dcfaf2608c2cc3a439ed3ca044ae655982306.tar.bz2
spark-fd1dcfaf2608c2cc3a439ed3ca044ae655982306.zip
5 files changed, 65 insertions, 5 deletions
diff --git a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlLexer.g b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlLexer.g
index e01e7101d0..44a63fbef2 100644
--- a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlLexer.g
+++ b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlLexer.g
@@ -103,6 +103,7 @@ KW_CLUSTER: 'CLUSTER';
 KW_DISTRIBUTE: 'DISTRIBUTE';
 KW_SORT: 'SORT';
 KW_UNION: 'UNION';
+KW_EXCEPT: 'EXCEPT';
 KW_LOAD: 'LOAD';
 KW_EXPORT: 'EXPORT';
 KW_IMPORT: 'IMPORT';
diff --git a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
index 4afce3090f..cf8a56566d 100644
--- a/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
+++ b/sql/catalyst/src/main/antlr3/org/apache/spark/sql/catalyst/parser/SparkSqlParser.g
@@ -88,6 +88,8 @@ TOK_DISTRIBUTEBY;
 TOK_SORTBY;
 TOK_UNIONALL;
 TOK_UNIONDISTINCT;
+TOK_EXCEPT;
+TOK_INTERSECT;
 TOK_JOIN;
 TOK_LEFTOUTERJOIN;
 TOK_RIGHTOUTERJOIN;
@@ -2122,6 +2124,8 @@ setOperator
 @after { popMsg(state); }
     : KW_UNION KW_ALL -> ^(TOK_UNIONALL)
     | KW_UNION KW_DISTINCT? -> ^(TOK_UNIONDISTINCT)
+    | KW_EXCEPT -> ^(TOK_EXCEPT)
+    | KW_INTERSECT -> ^(TOK_INTERSECT)
     ;
 
 queryStatementExpression[boolean topLevel]
@@ -2242,7 +2246,7 @@ setOpSelectStatement[CommonTree t, boolean topLevel]
       ^(TOK_QUERY
           ^(TOK_FROM
             ^(TOK_SUBQUERY
-              ^(TOK_UNIONALL {$setOpSelectStatement.tree} $b)
+              ^($u {$setOpSelectStatement.tree} $b)
               {adaptor.create(Identifier, generateUnionAlias())}
              )
           )
@@ -2252,12 +2256,12 @@ setOpSelectStatement[CommonTree t, boolean topLevel]
           )
        )
    -> {$setOpSelectStatement.tree != null && $u.tree.getType()!=SparkSqlParser.TOK_UNIONDISTINCT}?
-      ^(TOK_UNIONALL {$setOpSelectStatement.tree} $b)
+      ^($u {$setOpSelectStatement.tree} $b)
    -> {$setOpSelectStatement.tree == null && $u.tree.getType()==SparkSqlParser.TOK_UNIONDISTINCT}?
       ^(TOK_QUERY
           ^(TOK_FROM
             ^(TOK_SUBQUERY
-              ^(TOK_UNIONALL {$t} $b)
+              ^($u {$t} $b)
               {adaptor.create(Identifier, generateUnionAlias())}
              )
            )
@@ -2266,7 +2270,7 @@ setOpSelectStatement[CommonTree t, boolean topLevel]
             ^(TOK_SELECTDI ^(TOK_SELEXPR TOK_ALLCOLREF))
          )
        )
-   -> ^(TOK_UNIONALL {$t} $b)
+   -> ^($u {$t} $b)
    )+
    o=orderByClause?
    c=clusterByClause?
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala
index 42bdf25b61..1eda4a9a97 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/CatalystQl.scala
@@ -399,9 +399,14 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
       // return With plan if there is CTE
       cteRelations.map(With(query, _)).getOrElse(query)
 
-    // HIVE-9039 renamed TOK_UNION => TOK_UNIONALL while adding TOK_UNIONDISTINCT
     case Token("TOK_UNIONALL", left :: right :: Nil) =>
       Union(nodeToPlan(left), nodeToPlan(right))
+    case Token("TOK_UNIONDISTINCT", left :: right :: Nil) =>
+      Distinct(Union(nodeToPlan(left), nodeToPlan(right)))
+    case Token("TOK_EXCEPT", left :: right :: Nil) =>
+      Except(nodeToPlan(left), nodeToPlan(right))
+    case Token("TOK_INTERSECT", left :: right :: Nil) =>
+      Intersect(nodeToPlan(left), nodeToPlan(right))
 
     case _ =>
       noParseRule("Plan", node)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
new file mode 100644
index 0000000000..0fee97fb07
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/CatalystQlSuite.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst
+
+import org.apache.spark.sql.catalyst.plans.PlanTest
+
+class CatalystQlSuite extends PlanTest {
+
+  test("parse union/except/intersect") {
+    val paresr = new CatalystQl()
+    paresr.createPlan("select * from t1 union all select * from t2")
+    paresr.createPlan("select * from t1 union distinct select * from t2")
+    paresr.createPlan("select * from t1 union select * from t2")
+    paresr.createPlan("select * from t1 except select * from t2")
+    paresr.createPlan("select * from t1 intersect select * from t2")
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 98e22c2e2c..fa99289b41 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -787,6 +787,24 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter {
     assert(sql("select key from src having key > 490").collect().size < 100)
   }
 
+  test("union/except/intersect") {
+    assertResult(Array(Row(1), Row(1))) {
+      sql("select 1 as a union all select 1 as a").collect()
+    }
+    assertResult(Array(Row(1))) {
+      sql("select 1 as a union distinct select 1 as a").collect()
+    }
+    assertResult(Array(Row(1))) {
+      sql("select 1 as a union select 1 as a").collect()
+    }
+    assertResult(Array()) {
+      sql("select 1 as a except select 1 as a").collect()
+    }
+    assertResult(Array(Row(1))) {
+      sql("select 1 as a intersect select 1 as a").collect()
+    }
+  }
+
   test("SPARK-5383 alias for udfs with multi output columns") {
     assert(
       sql("select stack(2, key, value, key, value) as (a, b) from src limit 5")
author	Davies Liu <davies@databricks.com>	2016-01-06 23:46:12 -0800
committer	Reynold Xin <rxin@databricks.com>	2016-01-06 23:46:12 -0800
commit	fd1dcfaf2608c2cc3a439ed3ca044ae655982306 (patch)
tree	01c77444dc5ac7cccc80447f7103dd1801e8a361 /sql
parent	6a1c864ab6ee3e869a16ffdbaf6fead21c7aac6d (diff)
download	spark-fd1dcfaf2608c2cc3a439ed3ca044ae655982306.tar.gz spark-fd1dcfaf2608c2cc3a439ed3ca044ae655982306.tar.bz2 spark-fd1dcfaf2608c2cc3a439ed3ca044ae655982306.zip