aboutsummaryrefslogtreecommitdiff
path: root/sql/core/src
diff options
context:
space:
mode:
authorHerman van Hovell <hvanhovell@databricks.com>2017-02-07 22:28:59 +0100
committerHerman van Hovell <hvanhovell@databricks.com>2017-02-07 22:28:59 +0100
commit73ee73945e369a862480ef4ac64e55c797bd7d90 (patch)
treede712e38633bacba3e3c5f94b9c51bcd4259ba90 /sql/core/src
parentb7277e03d1038e2a19495c0ef7707e2d77937ccf (diff)
downloadspark-73ee73945e369a862480ef4ac64e55c797bd7d90.tar.gz
spark-73ee73945e369a862480ef4ac64e55c797bd7d90.tar.bz2
spark-73ee73945e369a862480ef4ac64e55c797bd7d90.zip
[SPARK-18609][SPARK-18841][SQL] Fix redundant Alias removal in the optimizer
## What changes were proposed in this pull request? The optimizer tries to remove redundant alias only projections from the query plan using the `RemoveAliasOnlyProject` rule. The current rule identifies removes such a project and rewrites the project's attributes in the **entire** tree. This causes problems when parts of the tree are duplicated (for instance a self join on a temporary view/CTE) and the duplicated part contains the alias only project, in this case the rewrite will break the tree. This PR fixes these problems by using a blacklist for attributes that are not to be moved, and by making sure that attribute remapping is only done for the parent tree, and not for unrelated parts of the query plan. The current tree transformation infrastructure works very well if the transformation at hand requires little or a global contextual information. In this case we need to know both the attributes that were not to be moved, and we also needed to know which child attributes were modified. This cannot be done easily using the current infrastructure, and solutions typically involves transversing the query plan multiple times (which is super slow). I have moved around some code in `TreeNode`, `QueryPlan` and `LogicalPlan`to make this much more straightforward; this basically allows you to manually traverse the tree. This PR subsumes the following PRs by windpiger: Closes https://github.com/apache/spark/pull/16267 Closes https://github.com/apache/spark/pull/16255 ## How was this patch tested? I have added unit tests to `RemoveRedundantAliasAndProjectSuite` and I have added integration tests to the `SQLQueryTestSuite.union` and `SQLQueryTestSuite.cte` test cases. Author: Herman van Hovell <hvanhovell@databricks.com> Closes #16757 from hvanhovell/SPARK-18609.
Diffstat (limited to 'sql/core/src')
-rw-r--r--sql/core/src/test/resources/sql-tests/inputs/cte.sql15
-rw-r--r--sql/core/src/test/resources/sql-tests/inputs/union.sql16
-rw-r--r--sql/core/src/test/resources/sql-tests/results/cte.sql.out49
-rw-r--r--sql/core/src/test/resources/sql-tests/results/union.sql.out70
4 files changed, 146 insertions, 4 deletions
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte.sql b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
index 3914db2691..d34d89f235 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cte.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
@@ -12,3 +12,18 @@ WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2;
-- WITH clause should reference the previous CTE
WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2;
+
+-- SPARK-18609 CTE with self-join
+WITH CTE1 AS (
+ SELECT b.id AS id
+ FROM T2 a
+ CROSS JOIN (SELECT id AS id FROM T2) b
+)
+SELECT t1.id AS c1,
+ t2.id AS c2
+FROM CTE1 t1
+ CROSS JOIN CTE1 t2;
+
+-- Clean up
+DROP VIEW IF EXISTS t;
+DROP VIEW IF EXISTS t2;
diff --git a/sql/core/src/test/resources/sql-tests/inputs/union.sql b/sql/core/src/test/resources/sql-tests/inputs/union.sql
index 1f4780abde..e57d69eaad 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/union.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/union.sql
@@ -22,6 +22,22 @@ FROM (SELECT 0 a, 0 b
SELECT SUM(1) a, CAST(0 AS BIGINT) b
UNION ALL SELECT 0 a, 0 b) T;
+-- Regression test for SPARK-18841 Push project through union should not be broken by redundant alias removal.
+CREATE OR REPLACE TEMPORARY VIEW p1 AS VALUES 1 T(col);
+CREATE OR REPLACE TEMPORARY VIEW p2 AS VALUES 1 T(col);
+CREATE OR REPLACE TEMPORARY VIEW p3 AS VALUES 1 T(col);
+SELECT 1 AS x,
+ col
+FROM (SELECT col AS col
+ FROM (SELECT p1.col AS col
+ FROM p1 CROSS JOIN p2
+ UNION ALL
+ SELECT col
+ FROM p3) T1) T2;
+
-- Clean-up
DROP VIEW IF EXISTS t1;
DROP VIEW IF EXISTS t2;
+DROP VIEW IF EXISTS p1;
+DROP VIEW IF EXISTS p2;
+DROP VIEW IF EXISTS p3;
diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
index 9fbad8f380..a446c2cd18 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 6
+-- Number of queries: 9
-- !query 0
@@ -55,3 +55,50 @@ struct<id:int,2:int>
0 2
1 2
1 2
+
+
+-- !query 6
+WITH CTE1 AS (
+ SELECT b.id AS id
+ FROM T2 a
+ CROSS JOIN (SELECT id AS id FROM T2) b
+)
+SELECT t1.id AS c1,
+ t2.id AS c2
+FROM CTE1 t1
+ CROSS JOIN CTE1 t2
+-- !query 6 schema
+struct<c1:int,c2:int>
+-- !query 6 output
+0 0
+0 0
+0 0
+0 0
+0 1
+0 1
+0 1
+0 1
+1 0
+1 0
+1 0
+1 0
+1 1
+1 1
+1 1
+1 1
+
+
+-- !query 7
+DROP VIEW IF EXISTS t
+-- !query 7 schema
+struct<>
+-- !query 7 output
+
+
+
+-- !query 8
+DROP VIEW IF EXISTS t2
+-- !query 8 schema
+struct<>
+-- !query 8 output
+
diff --git a/sql/core/src/test/resources/sql-tests/results/union.sql.out b/sql/core/src/test/resources/sql-tests/results/union.sql.out
index c57028cabe..d123b7fdbe 100644
--- a/sql/core/src/test/resources/sql-tests/results/union.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/union.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 7
+-- Number of queries: 14
-- !query 0
@@ -65,7 +65,7 @@ struct<a:bigint>
-- !query 5
-DROP VIEW IF EXISTS t1
+CREATE OR REPLACE TEMPORARY VIEW p1 AS VALUES 1 T(col)
-- !query 5 schema
struct<>
-- !query 5 output
@@ -73,8 +73,72 @@ struct<>
-- !query 6
-DROP VIEW IF EXISTS t2
+CREATE OR REPLACE TEMPORARY VIEW p2 AS VALUES 1 T(col)
-- !query 6 schema
struct<>
-- !query 6 output
+
+
+-- !query 7
+CREATE OR REPLACE TEMPORARY VIEW p3 AS VALUES 1 T(col)
+-- !query 7 schema
+struct<>
+-- !query 7 output
+
+
+
+-- !query 8
+SELECT 1 AS x,
+ col
+FROM (SELECT col AS col
+ FROM (SELECT p1.col AS col
+ FROM p1 CROSS JOIN p2
+ UNION ALL
+ SELECT col
+ FROM p3) T1) T2
+-- !query 8 schema
+struct<x:int,col:int>
+-- !query 8 output
+1 1
+1 1
+
+
+-- !query 9
+DROP VIEW IF EXISTS t1
+-- !query 9 schema
+struct<>
+-- !query 9 output
+
+
+
+-- !query 10
+DROP VIEW IF EXISTS t2
+-- !query 10 schema
+struct<>
+-- !query 10 output
+
+
+
+-- !query 11
+DROP VIEW IF EXISTS p1
+-- !query 11 schema
+struct<>
+-- !query 11 output
+
+
+
+-- !query 12
+DROP VIEW IF EXISTS p2
+-- !query 12 schema
+struct<>
+-- !query 12 output
+
+
+
+-- !query 13
+DROP VIEW IF EXISTS p3
+-- !query 13 schema
+struct<>
+-- !query 13 output
+