aboutsummaryrefslogtreecommitdiff
path: root/sql/catalyst
diff options
context:
space:
mode:
authorWenchen Fan <wenchen@databricks.com>2016-07-14 15:48:22 +0800
committerCheng Lian <lian@databricks.com>2016-07-14 15:48:22 +0800
commitdb7317ac3c2fd2a11088d10060f168178dc99664 (patch)
tree016edf4914d49f9563cbc800230ca2f144903e6f /sql/catalyst
parent39c836e976fcae51568bed5ebab28e148383b5d4 (diff)
downloadspark-db7317ac3c2fd2a11088d10060f168178dc99664.tar.gz
spark-db7317ac3c2fd2a11088d10060f168178dc99664.tar.bz2
spark-db7317ac3c2fd2a11088d10060f168178dc99664.zip
[SPARK-16448] RemoveAliasOnlyProject should not remove alias with metadata
## What changes were proposed in this pull request? `Alias` with metadata is not a no-op and we should not strip it in `RemoveAliasOnlyProject` rule. This PR also did some improvement for this rule: 1. extend the semantic of `alias-only`. Now we allow the project list to be partially aliased. 2. add unit test for this rule. ## How was this patch tested? new `RemoveAliasOnlyProjectSuite` Author: Wenchen Fan <wenchen@databricks.com> Closes #14106 from cloud-fan/bug.
Diffstat (limited to 'sql/catalyst')
-rw-r--r--sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala49
-rw-r--r--sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala77
2 files changed, 108 insertions, 18 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 08fb0199fc..c8e9d8e2f9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -165,36 +165,49 @@ object PushProjectThroughSample extends Rule[LogicalPlan] {
* but can also benefit other operators.
*/
object RemoveAliasOnlyProject extends Rule[LogicalPlan] {
- // Check if projectList in the Project node has the same attribute names and ordering
- // as its child node.
+ /**
+ * Returns true if the project list is semantically same as child output, after strip alias on
+ * attribute.
+ */
private def isAliasOnly(
projectList: Seq[NamedExpression],
childOutput: Seq[Attribute]): Boolean = {
- if (!projectList.forall(_.isInstanceOf[Alias]) || projectList.length != childOutput.length) {
+ if (projectList.length != childOutput.length) {
false
} else {
- projectList.map(_.asInstanceOf[Alias]).zip(childOutput).forall { case (a, o) =>
- a.child match {
- case attr: Attribute if a.name == attr.name && attr.semanticEquals(o) => true
- case _ => false
- }
+ stripAliasOnAttribute(projectList).zip(childOutput).forall {
+ case (a: Attribute, o) if a semanticEquals o => true
+ case _ => false
}
}
}
+ private def stripAliasOnAttribute(projectList: Seq[NamedExpression]) = {
+ projectList.map {
+ // Alias with metadata can not be stripped, or the metadata will be lost.
+ // If the alias name is different from attribute name, we can't strip it either, or we may
+ // accidentally change the output schema name of the root plan.
+ case a @ Alias(attr: Attribute, name) if a.metadata == Metadata.empty && name == attr.name =>
+ attr
+ case other => other
+ }
+ }
+
def apply(plan: LogicalPlan): LogicalPlan = {
- val aliasOnlyProject = plan.find {
- case Project(pList, child) if isAliasOnly(pList, child.output) => true
- case _ => false
+ val aliasOnlyProject = plan.collectFirst {
+ case p @ Project(pList, child) if isAliasOnly(pList, child.output) => p
}
- aliasOnlyProject.map { case p: Project =>
- val aliases = p.projectList.map(_.asInstanceOf[Alias])
- val attrMap = AttributeMap(aliases.map(a => (a.toAttribute, a.child)))
- plan.transformAllExpressions {
- case a: Attribute if attrMap.contains(a) => attrMap(a)
- }.transform {
- case op: Project if op.eq(p) => op.child
+ aliasOnlyProject.map { case proj =>
+ val attributesToReplace = proj.output.zip(proj.child.output).filterNot {
+ case (a1, a2) => a1 semanticEquals a2
+ }
+ val attrMap = AttributeMap(attributesToReplace)
+ plan transform {
+ case plan: Project if plan eq proj => plan.child
+ case plan => plan transformExpressions {
+ case a: Attribute if attrMap.contains(a) => attrMap(a)
+ }
}
}.getOrElse(plan)
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala
new file mode 100644
index 0000000000..7c26cb5598
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveAliasOnlyProjectSuite.scala
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.types.MetadataBuilder
+
+class RemoveAliasOnlyProjectSuite extends PlanTest with PredicateHelper {
+
+ object Optimize extends RuleExecutor[LogicalPlan] {
+ val batches = Batch("RemoveAliasOnlyProject", FixedPoint(50), RemoveAliasOnlyProject) :: Nil
+ }
+
+ test("all expressions in project list are aliased child output") {
+ val relation = LocalRelation('a.int, 'b.int)
+ val query = relation.select('a as 'a, 'b as 'b).analyze
+ val optimized = Optimize.execute(query)
+ comparePlans(optimized, relation)
+ }
+
+ test("all expressions in project list are aliased child output but with different order") {
+ val relation = LocalRelation('a.int, 'b.int)
+ val query = relation.select('b as 'b, 'a as 'a).analyze
+ val optimized = Optimize.execute(query)
+ comparePlans(optimized, query)
+ }
+
+ test("some expressions in project list are aliased child output") {
+ val relation = LocalRelation('a.int, 'b.int)
+ val query = relation.select('a as 'a, 'b).analyze
+ val optimized = Optimize.execute(query)
+ comparePlans(optimized, relation)
+ }
+
+ test("some expressions in project list are aliased child output but with different order") {
+ val relation = LocalRelation('a.int, 'b.int)
+ val query = relation.select('b as 'b, 'a).analyze
+ val optimized = Optimize.execute(query)
+ comparePlans(optimized, query)
+ }
+
+ test("some expressions in project list are not Alias or Attribute") {
+ val relation = LocalRelation('a.int, 'b.int)
+ val query = relation.select('a as 'a, 'b + 1).analyze
+ val optimized = Optimize.execute(query)
+ comparePlans(optimized, query)
+ }
+
+ test("some expressions in project list are aliased child output but with metadata") {
+ val relation = LocalRelation('a.int, 'b.int)
+ val metadata = new MetadataBuilder().putString("x", "y").build()
+ val aliasWithMeta = Alias('a, "a")(explicitMetadata = Some(metadata))
+ val query = relation.select(aliasWithMeta, 'b).analyze
+ val optimized = Optimize.execute(query)
+ comparePlans(optimized, query)
+ }
+}