about summary refs log tree commit diff
path: root/sql/core/src/test
diff options
context:
space:
mode:
author: Lianhui Wang <lianhuiwang09@gmail.com> 2016-07-12 18:52:15 +0200
committer: Herman van Hovell <hvanhovell@databricks.com> 2016-07-12 18:52:15 +0200
commit: 5ad68ba5ce625c7005b540ca50ed001ca18de967 (patch)
tree: fb8d1e7f11c9ac6f2c4a89a8b384a702d489c6a5 /sql/core/src/test
parent: 6cb75db9ab1a4f227069bec2763b89546b88b0ee (diff)
downloadspark-5ad68ba5ce625c7005b540ca50ed001ca18de967.tar.gz
spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.tar.bz2
spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.zip
[SPARK-15752][SQL] Optimize metadata only query that has an aggregate whose children are deterministic project or filter operators.
## What changes were proposed in this pull request? When a query uses only metadata (for example, a partition key), it can return results based on metadata alone, without scanning files. Hive did this in HIVE-1003. ## How was this patch tested? Added unit tests. Author: Lianhui Wang <lianhuiwang09@gmail.com> Author: Wenchen Fan <wenchen@databricks.com> Author: Lianhui Wang <lianhuiwang@users.noreply.github.com> Closes #13494 from lianhuiwang/metadata-only.
Diffstat (limited to 'sql/core/src/test')
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 36
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala | 120
2 files changed, 156 insertions, 0 deletions
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index ede7d9a0c9..eeaa0103a0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2929,4 +2929,40 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
sql(s"SELECT '$literal' AS DUMMY"),
Row(s"$expected") :: Nil)
}
+
+ test("SPARK-15752 optimize metadata only query for datasource table") {
+ withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
+ withTable("srcpart_15752") {
+ val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b"))
+ .toDF("col1", "col2", "partcol1", "partcol2")
+ data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart_15752")
+ checkAnswer(
+ sql("select partcol1 from srcpart_15752 group by partcol1"),
+ Row(0) :: Row(1) :: Nil)
+ checkAnswer(
+ sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"),
+ Row(1))
+ checkAnswer(
+ sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"),
+ Row(0, 1) :: Row(1, 1) :: Nil)
+ checkAnswer(
+ sql("select partcol1, count(distinct partcol2) from srcpart_15752 where partcol1 = 1 " +
+ "group by partcol1"),
+ Row(1, 1) :: Nil)
+ checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil)
+ checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1))
+ checkAnswer(
+ sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " +
+ "where partcol1 = 1) t"),
+ Row(2))
+ checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1))
+ checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1))
+ checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1))
+ checkAnswer(
+ sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " +
+ "where partcol1 = 1) t"),
+ Row(2))
+ }
+ }
+ }
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala
new file mode 100644
index 0000000000..58c310596c
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/OptimizeMetadataOnlyQuerySuite.scala
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSQLContext
+
+class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext {
+ import testImplicits._
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "even" else "odd"))
+ .toDF("col1", "col2", "partcol1", "partcol2")
+ data.write.partitionBy("partcol1", "partcol2").mode("append").saveAsTable("srcpart")
+ }
+
+ override protected def afterAll(): Unit = {
+ try {
+ sql("DROP TABLE IF EXISTS srcpart")
+ } finally {
+ super.afterAll()
+ }
+ }
+
+ private def assertMetadataOnlyQuery(df: DataFrame): Unit = {
+ val localRelations = df.queryExecution.optimizedPlan.collect {
+ case l @ LocalRelation(_, _) => l
+ }
+ assert(localRelations.size == 1)
+ }
+
+ private def assertNotMetadataOnlyQuery(df: DataFrame): Unit = {
+ val localRelations = df.queryExecution.optimizedPlan.collect {
+ case l @ LocalRelation(_, _) => l
+ }
+ assert(localRelations.size == 0)
+ }
+
+ private def testMetadataOnly(name: String, sqls: String*): Unit = {
+ test(name) {
+ withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
+ sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) }
+ }
+ withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") {
+ sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) }
+ }
+ }
+ }
+
+ private def testNotMetadataOnly(name: String, sqls: String*): Unit = {
+ test(name) {
+ withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
+ sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) }
+ }
+ withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") {
+ sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) }
+ }
+ }
+ }
+
+ testMetadataOnly(
+ "Aggregate expression is partition columns",
+ "select partcol1 from srcpart group by partcol1",
+ "select partcol2 from srcpart where partcol1 = 0 group by partcol2")
+
+ testMetadataOnly(
+ "Distinct aggregate function on partition columns",
+ "SELECT partcol1, count(distinct partcol2) FROM srcpart group by partcol1",
+ "SELECT partcol1, count(distinct partcol2) FROM srcpart where partcol1 = 0 group by partcol1")
+
+ testMetadataOnly(
+ "Distinct on partition columns",
+ "select distinct partcol1, partcol2 from srcpart",
+ "select distinct c1 from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")
+
+ testMetadataOnly(
+ "Aggregate function on partition columns which have same result w or w/o DISTINCT keyword",
+ "select max(partcol1) from srcpart",
+ "select min(partcol1) from srcpart where partcol1 = 0",
+ "select first(partcol1) from srcpart",
+ "select last(partcol1) from srcpart where partcol1 = 0",
+ "select partcol2, min(partcol1) from srcpart where partcol1 = 0 group by partcol2",
+ "select max(c1) from (select partcol1 + 1 as c1 from srcpart where partcol1 = 0) t")
+
+ testNotMetadataOnly(
+ "Don't optimize metadata only query for non-partition columns",
+ "select col1 from srcpart group by col1",
+ "select partcol1, max(col1) from srcpart group by partcol1",
+ "select partcol1, count(distinct col1) from srcpart group by partcol1",
+ "select distinct partcol1, col1 from srcpart")
+
+ testNotMetadataOnly(
+ "Don't optimize metadata only query for non-distinct aggregate function on partition columns",
+ "select partcol1, sum(partcol2) from srcpart group by partcol1",
+ "select partcol1, count(partcol2) from srcpart group by partcol1")
+
+ testNotMetadataOnly(
+ "Don't optimize metadata only query for GroupingSet/Union operator",
+ "select partcol1, max(partcol2) from srcpart where partcol1 = 0 group by rollup (partcol1)",
+ "select partcol2 from (select partcol2 from srcpart where partcol1 = 0 union all " +
+ "select partcol2 from srcpart where partcol1 = 1) t group by partcol2")
+}