[SPARK-15752][SQL] Optimize metadata only query that has an aggregate whose children are deterministic project or filter operators.

## What changes were proposed in this pull request? when query only use metadata (example: partition key), it can return results based on metadata without scanning files. Hive did it in HIVE-1003. ## How was this patch tested? add unit tests Author: Lianhui Wang <lianhuiwang09@gmail.com> Author: Wenchen Fan <wenchen@databricks.com> Author: Lianhui Wang <lianhuiwang@users.noreply.github.com> Closes #13494 from lianhuiwang/metadata-only.
author: Lianhui Wang <lianhuiwang09@gmail.com> 2016-07-12 18:52:15 +0200
committer: Herman van Hovell <hvanhovell@databricks.com> 2016-07-12 18:52:15 +0200
commit: 5ad68ba5ce625c7005b540ca50ed001ca18de967 (patch)
tree: fb8d1e7f11c9ac6f2c4a89a8b384a702d489c6a5 /sql/core/src/test
parent: 6cb75db9ab1a4f227069bec2763b89546b88b0ee (diff)
download: spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.tar.gz
spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.tar.bz2
spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.zip
2 files changed, 156 insertions, 0 deletions
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index ede7d9a0c9..eeaa0103a0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2929,4 +2929,40 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       sql(s"SELECT '$literal' AS DUMMY"),
       Row(s"$expected") :: Nil)
   }
+
+  test("SPARK-15752 optimize metadata only query for datasource table") {
+    withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
+      withTable("srcpart_15752") {
+        val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b"))
+          .toDF("col1", "col2", "partcol1", "partcol2")
+        data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart_15752")
+        checkAnswer(
+          sql("select partcol1 from srcpart_15752 group by partcol1"),
+          Row(0) :: Row(1) :: Nil)
+        checkAnswer(
+          sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"),
+          Row(1))
+        checkAnswer(
+          sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"),
+          Row(0, 1) :: Row(1, 1) :: Nil)
+        checkAnswer(
+          sql("select partcol1, count(distinct partcol2) from srcpart_15752  where partcol1 = 1 " +
+            "group by partcol1"),
+          Row(1, 1) :: Nil)
+        checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil)
+        checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1))
+        checkAnswer(
+          sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " +
+            "where partcol1 = 1) t"),
+          Row(2))
+        checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1))
+        checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1))
+        checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1))
+        checkAnswer(
+          sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " +
+            "where partcol1 = 1) t"),
+          Row(2))
+      }
+    }
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala
new file mode 100644
index 0000000000..58c310596c
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSQLContext
+
+class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext {
+  import testImplicits._
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd"))
+      .toDF("col1", "col2", "partcol1", "partcol2")
+    data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart")
+  }
+
+  override protected def afterAll(): Unit = {
+    try {
+      sql("DROP TABLE IF EXISTS srcpart")
+    } finally {
+      super.afterAll()
+    }
+  }
+
+  private def assertMetadataOnlyQuery(df: DataFrame): Unit = {
+    val localRelations = df.queryExecution.optimizedPlan.collect {
+      case l @ LocalRelation(_, _) => l
+    }
+    assert(localRelations.size == 1)
+  }
+
+  private def assertNotMetadataOnlyQuery(df: DataFrame): Unit = {
+    val localRelations = df.queryExecution.optimizedPlan.collect {
+      case l @ LocalRelation(_, _) => l
+    }
+    assert(localRelations.size == 0)
+  }
+
+  private def testMetadataOnly(name: String, sqls: String*): Unit = {
+    test(name) {
+      withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
+        sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) }
+      }
+      withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") {
+        sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) }
+      }
+    }
+  }
+
+  private def testNotMetadataOnly(name: String, sqls: String*): Unit = {
+    test(name) {
+      withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
+        sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) }
+      }
+      withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") {
+        sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) }
+      }
+    }
+  }
+
+  testMetadataOnly(
+    "Aggregate expression is partition columns",
+    "select partcol1 from srcpart group by partcol1",
+    "select partcol2 from srcpart where partcol1 = 0 group by partcol2")
+
+  testMetadataOnly(
+    "Distinct aggregate function on partition columns",
+    "SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1",
+    "SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 group by partcol1")
+
+  testMetadataOnly(
+    "Distinct on partition columns",
+    "select distinct partcol1, partcol2 from srcpart",
+    "select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")
+
+  testMetadataOnly(
+    "Aggregate function on partition columns which have same result w or w/o DISTINCT keyword",
+    "select max(partcol1) from srcpart",
+    "select min(partcol1) from srcpart where partcol1 = 0",
+    "select first(partcol1) from srcpart",
+    "select last(partcol1) from srcpart where partcol1 = 0",
+    "select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2",
+    "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")
+
+  testNotMetadataOnly(
+    "Don't optimize metadata only query for non-partition columns",
+    "select col1 from srcpart group by col1",
+    "select partcol1, max(col1) from srcpart group by partcol1",
+    "select partcol1, count(distinct col1) from srcpart group by partcol1",
+    "select distinct partcol1, col1 from srcpart")
+
+  testNotMetadataOnly(
+    "Don't optimize metadata only query for non-distinct aggregate function on partition columns",
+    "select partcol1, sum(partcol2) from srcpart group by partcol1",
+    "select partcol1, count(partcol2) from srcpart group by partcol1")
+
+  testNotMetadataOnly(
+    "Don't optimize metadata only query for GroupingSet/Union operator",
+    "select partcol1, max(partcol2) from srcpart where partcol1 = 0 group by rollup (partcol1)",
+    "select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " +
+      "select partcol2 from srcpart where partcol1 = 1) t group by partcol2")
+}
author	Lianhui Wang <lianhuiwang09@gmail.com>	2016-07-12 18:52:15 +0200
committer	Herman van Hovell <hvanhovell@databricks.com>	2016-07-12 18:52:15 +0200
commit	5ad68ba5ce625c7005b540ca50ed001ca18de967 (patch)
tree	fb8d1e7f11c9ac6f2c4a89a8b384a702d489c6a5 /sql/core/src/test
parent	6cb75db9ab1a4f227069bec2763b89546b88b0ee (diff)
download	spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.tar.gz spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.tar.bz2 spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.zip