diff options
author | Wenchen Fan <wenchen@databricks.com> | 2017-01-19 20:09:48 -0800 |
---|---|---|
committer | gatorsmile <gatorsmile@gmail.com> | 2017-01-19 20:09:48 -0800 |
commit | 0bf605c2c67ca361cd4aa3a3b4492bef4aef76b9 (patch) | |
tree | 2f7bd70db8c44a5c02b769beeafb789686a0da13 /sql/hive | |
parent | 148a84b37082697c7f61c6a621010abe4b12f2eb (diff) | |
download | spark-0bf605c2c67ca361cd4aa3a3b4492bef4aef76b9.tar.gz spark-0bf605c2c67ca361cd4aa3a3b4492bef4aef76b9.tar.bz2 spark-0bf605c2c67ca361cd4aa3a3b4492bef4aef76b9.zip |
[SPARK-19292][SQL] filter with partition columns should be case-insensitive on Hive tables
## What changes were proposed in this pull request?
When we query a table with a filter on partition columns, we will push the partition filter to the metastore to fetch the matching partitions directly.
In `HiveExternalCatalog.listPartitionsByFilter`, we assume the column names in the partition filter are already normalized and we don't need to consider case sensitivity. However, `HiveTableScanExec` doesn't follow this assumption. This PR fixes it.
## How was this patch tested?
new regression test
Author: Wenchen Fan <wenchen@databricks.com>
Closes #16647 from cloud-fan/bug.
Diffstat (limited to 'sql/hive')
-rw-r--r-- | sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala | 12 | ||||
-rw-r--r-- | sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala | 13 |
2 files changed, 24 insertions, 1 deletion
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala index 7ee5fc543c..def6ef3691 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala @@ -146,9 +146,19 @@ case class HiveTableScanExec( hadoopReader.makeRDDForTable(relation.hiveQlTable) } } else { + // The attribute name of predicate could be different than the one in schema in case of + // case insensitive, we should change them to match the one in schema, so we do not need to + // worry about case sensitivity anymore. + val normalizedFilters = partitionPruningPred.map { e => + e transform { + case a: AttributeReference => + a.withName(relation.output.find(_.semanticEquals(a)).get.name) + } + } + Utils.withDummyCallSite(sqlContext.sparkContext) { hadoopReader.makeRDDForPartitionedTable( - prunePartitions(relation.getHiveQlPartitions(partitionPruningPred))) + prunePartitions(relation.getHiveQlPartitions(normalizedFilters))) } } val numOutputRows = longMetric("numOutputRows") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 104b5250b6..1a28c4c84a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2014,4 +2014,17 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ) } } + + test("SPARK-19292: filter with partition columns should be case-insensitive on Hive tables") { + withTable("tbl") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql("CREATE TABLE tbl(i int, j int) USING hive PARTITIONED BY (j)") + sql("INSERT INTO tbl PARTITION(j=10) SELECT 1") + 
checkAnswer(spark.table("tbl"), Row(1, 10)) + + checkAnswer(sql("SELECT i, j FROM tbl WHERE J=10"), Row(1, 10)) + checkAnswer(spark.table("tbl").filter($"J" === 10), Row(1, 10)) + } + } + } } |