[SPARK-17077][SQL] Cardinality estimation for project operator

## What changes were proposed in this pull request?

Support cardinality estimation for project operator.

## How was this patch tested?

Add a test suite and a base class in the catalyst package.

Author: Zhenhua Wang <wzh_zju@163.com>

Closes #16430 from wzhfy/projectEstimation.
author: Zhenhua Wang <wzh_zju@163.com> 2017-01-08 21:15:52 -0800
committer: Reynold Xin <rxin@databricks.com> 2017-01-08 21:15:52 -0800
commit: 3ccabdfb4d760d684b1e0c0ed448a57331f209f2 (patch)
tree: 7d884209ea08ba5350b161e6bd5806b6083680db /sql/catalyst/src/test
parent: 19d9d4c855eab8f647a5ec66b079172de81221d0 (diff)
download: spark-3ccabdfb4d760d684b1e0c0ed448a57331f209f2.tar.gz
spark-3ccabdfb4d760d684b1e0c0ed448a57331f209f2.tar.bz2
spark-3ccabdfb4d760d684b1e0c0ed448a57331f209f2.zip
2 files changed, 92 insertions, 0 deletions
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/ProjectEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/ProjectEstimationSuite.scala
new file mode 100644
index 0000000000..4a1bed84f8
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/ProjectEstimationSuite.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.statsEstimation
+
+import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeMap, AttributeReference}
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._
+import org.apache.spark.sql.types.IntegerType
+
+
+/** Checks that a Project over a plan with known statistics reports the expected statistics. */
+class ProjectEstimationSuite extends StatsEstimationTestBase {
+  test("estimate project with alias") {
+    // Two integer input columns, each paired with a hand-written ColumnStat.
+    val key1Attr = AttributeReference("key1", IntegerType)()
+    val key2Attr = AttributeReference("key2", IntegerType)()
+    val key1Stat = ColumnStat(2, Some(1), Some(2), 0, 4, 4)
+    val key2Stat = ColumnStat(1, Some(10), Some(10), 0, 4, 4)
+    val inputColStats = AttributeMap(Seq(key1Attr -> key1Stat, key2Attr -> key2Stat))
+    val inputStats =
+      Statistics(sizeInBytes = 2 * (4 + 4), rowCount = Some(2), attributeStats = inputColStats)
+    val input = StatsTestPlan(outputList = Seq(key1Attr, key2Attr), stats = inputStats)
+
+    // key1 is passed through unchanged; key2 is exposed under the alias "abc".
+    val projected = Project(Seq(key1Attr, Alias(key2Attr, "abc")()), input)
+    val wantedColStats = toAttributeMap(Seq("key1" -> key1Stat, "abc" -> key2Stat), projected)
+    // The expected row count equals the child's: a project keeps every row.
+    val wantedRowSize = getRowSize(projected.output, wantedColStats)
+    val wanted = Statistics(
+      sizeInBytes = 2 * wantedRowSize,
+      rowCount = Some(2),
+      attributeStats = wantedColStats)
+
+    assert(projected.statistics == wanted)
+  }
+}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala
new file mode 100644
index 0000000000..fa5b290ecb
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/StatsEstimationTestBase.scala
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.statsEstimation
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap}
+import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LogicalPlan, Statistics}
+
+
+/** Base class for statistics estimation test suites; offers a helper to build column stats. */
+class StatsEstimationTestBase extends SparkFunSuite {
+  /** Build an AttributeMap by looking up each column name among the plan's output attributes. */
+  def toAttributeMap(colStats: Seq[(String, ColumnStat)], plan: LogicalPlan)
+    : AttributeMap[ColumnStat] = {
+    val attrsByName = plan.output.map(attr => attr.name -> attr).toMap
+    AttributeMap(colStats.map { case (colName, colStat) => attrsByName(colName) -> colStat })
+  }
+}
+
+/**
+ * Test-only leaf plan that reports the output attributes and statistics it was constructed with.
+ */
+protected case class StatsTestPlan(outputList: Seq[Attribute], stats: Statistics) extends LeafNode {
+  override lazy val statistics: Statistics = stats
+  override def output: Seq[Attribute] = outputList
+}
author	Zhenhua Wang <wzh_zju@163.com>	2017-01-08 21:15:52 -0800
committer	Reynold Xin <rxin@databricks.com>	2017-01-08 21:15:52 -0800
commit	3ccabdfb4d760d684b1e0c0ed448a57331f209f2 (patch)
tree	7d884209ea08ba5350b161e6bd5806b6083680db /sql/catalyst/src/test
parent	19d9d4c855eab8f647a5ec66b079172de81221d0 (diff)
download	spark-3ccabdfb4d760d684b1e0c0ed448a57331f209f2.tar.gz spark-3ccabdfb4d760d684b1e0c0ed448a57331f209f2.tar.bz2 spark-3ccabdfb4d760d684b1e0c0ed448a57331f209f2.zip