aboutsummaryrefslogtreecommitdiff
path: root/sql/hive
diff options
context:
space:
mode:
authorLianhui Wang <lianhuiwang09@gmail.com>2016-07-12 18:52:15 +0200
committerHerman van Hovell <hvanhovell@databricks.com>2016-07-12 18:52:15 +0200
commit5ad68ba5ce625c7005b540ca50ed001ca18de967 (patch)
treefb8d1e7f11c9ac6f2c4a89a8b384a702d489c6a5 /sql/hive
parent6cb75db9ab1a4f227069bec2763b89546b88b0ee (diff)
downloadspark-5ad68ba5ce625c7005b540ca50ed001ca18de967.tar.gz
spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.tar.bz2
spark-5ad68ba5ce625c7005b540ca50ed001ca18de967.zip
[SPARK-15752][SQL] Optimize metadata only query that has an aggregate whose children are deterministic project or filter operators.
## What changes were proposed in this pull request? when query only use metadata (example: partition key), it can return results based on metadata without scanning files. Hive did it in HIVE-1003. ## How was this patch tested? add unit tests Author: Lianhui Wang <lianhuiwang09@gmail.com> Author: Wenchen Fan <wenchen@databricks.com> Author: Lianhui Wang <lianhuiwang@users.noreply.github.com> Closes #13494 from lianhuiwang/metadata-only.
Diffstat (limited to 'sql/hive')
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala89
1 files changed, 89 insertions, 0 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index e8af4fbe87..a43f0d0d7e 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -1689,4 +1689,93 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
)
}
}
+
+ test("SPARK-15752 optimize metadata only query for hive table") {
+ withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
+ withTable("data_15752", "srcpart_15752", "srctext_15752") {
+ val df = Seq((1, "2"), (3, "4")).toDF("key", "value")
+ df.createOrReplaceTempView("data_15752")
+ sql(
+ """
+ |CREATE TABLE srcpart_15752 (col1 INT, col2 STRING)
+ |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS parquet
+ """.stripMargin)
+ for (partcol1 <- Seq(0, 1); partcol2 <- Seq("a", "b")) {
+ sql(
+ s"""
+ |INSERT OVERWRITE TABLE srcpart_15752
+ |PARTITION (partcol1='$partcol1', partcol2='$partcol2')
+ |select key, value from data_15752
+ """.stripMargin)
+ }
+ checkAnswer(
+ sql("select partcol1 from srcpart_15752 group by partcol1"),
+ Row(0) :: Row(1) :: Nil)
+ checkAnswer(
+ sql("select partcol1 from srcpart_15752 where partcol1 = 1 group by partcol1"),
+ Row(1))
+ checkAnswer(
+ sql("select partcol1, count(distinct partcol2) from srcpart_15752 group by partcol1"),
+ Row(0, 2) :: Row(1, 2) :: Nil)
+ checkAnswer(
+ sql("select partcol1, count(distinct partcol2) from srcpart_15752 where partcol1 = 1 " +
+ "group by partcol1"),
+ Row(1, 2) :: Nil)
+ checkAnswer(sql("select distinct partcol1 from srcpart_15752"), Row(0) :: Row(1) :: Nil)
+ checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1))
+ checkAnswer(
+ sql("select distinct col from (select partcol1 + 1 as col from srcpart_15752 " +
+ "where partcol1 = 1) t"),
+ Row(2))
+ checkAnswer(sql("select distinct partcol1 from srcpart_15752 where partcol1 = 1"), Row(1))
+ checkAnswer(sql("select max(partcol1) from srcpart_15752"), Row(1))
+ checkAnswer(sql("select max(partcol1) from srcpart_15752 where partcol1 = 1"), Row(1))
+ checkAnswer(sql("select max(partcol1) from (select partcol1 from srcpart_15752) t"), Row(1))
+ checkAnswer(
+ sql("select max(col) from (select partcol1 + 1 as col from srcpart_15752 " +
+ "where partcol1 = 1) t"),
+ Row(2))
+
+ sql(
+ """
+ |CREATE TABLE srctext_15752 (col1 INT, col2 STRING)
+ |PARTITIONED BY (partcol1 INT, partcol2 STRING) STORED AS textfile
+ """.stripMargin)
+ for (partcol1 <- Seq(0, 1); partcol2 <- Seq("a", "b")) {
+ sql(
+ s"""
+ |INSERT OVERWRITE TABLE srctext_15752
+ |PARTITION (partcol1='$partcol1', partcol2='$partcol2')
+ |select key, value from data_15752
+ """.stripMargin)
+ }
+ checkAnswer(
+ sql("select partcol1 from srctext_15752 group by partcol1"),
+ Row(0) :: Row(1) :: Nil)
+ checkAnswer(
+ sql("select partcol1 from srctext_15752 where partcol1 = 1 group by partcol1"),
+ Row(1))
+ checkAnswer(
+ sql("select partcol1, count(distinct partcol2) from srctext_15752 group by partcol1"),
+ Row(0, 2) :: Row(1, 2) :: Nil)
+ checkAnswer(
+ sql("select partcol1, count(distinct partcol2) from srctext_15752 where partcol1 = 1 " +
+ "group by partcol1"),
+ Row(1, 2) :: Nil)
+ checkAnswer(sql("select distinct partcol1 from srctext_15752"), Row(0) :: Row(1) :: Nil)
+ checkAnswer(sql("select distinct partcol1 from srctext_15752 where partcol1 = 1"), Row(1))
+ checkAnswer(
+ sql("select distinct col from (select partcol1 + 1 as col from srctext_15752 " +
+ "where partcol1 = 1) t"),
+ Row(2))
+ checkAnswer(sql("select max(partcol1) from srctext_15752"), Row(1))
+ checkAnswer(sql("select max(partcol1) from srctext_15752 where partcol1 = 1"), Row(1))
+ checkAnswer(sql("select max(partcol1) from (select partcol1 from srctext_15752) t"), Row(1))
+ checkAnswer(
+ sql("select max(col) from (select partcol1 + 1 as col from srctext_15752 " +
+ "where partcol1 = 1) t"),
+ Row(2))
+ }
+ }
+ }
}