diff options
author | Takuya UESHIN <ueshin@happy-camper.st> | 2017-03-03 16:35:54 -0800 |
---|---|---|
committer | Wenchen Fan <wenchen@databricks.com> | 2017-03-03 16:35:54 -0800 |
commit | 2a7921a813ecd847fd933ffef10edc64684e9df7 (patch) | |
tree | e13297965da232e7e60c4ebc2f1e9ce050f11851 /sql/hive | |
parent | ba186a841fcfcd73a1530ca2418cc08bb0df92e1 (diff) | |
download | spark-2a7921a813ecd847fd933ffef10edc64684e9df7.tar.gz spark-2a7921a813ecd847fd933ffef10edc64684e9df7.tar.bz2 spark-2a7921a813ecd847fd933ffef10edc64684e9df7.zip |
[SPARK-18939][SQL] Timezone support in partition values.
## What changes were proposed in this pull request?
This is a follow-up pr of #16308 and #16750.
This pr enables timezone support in partition values.
We should use `timeZone` option introduced at #16750 to parse/format partition values of the `TimestampType`.
For example, if you have timestamp `"2016-01-01 00:00:00"` in `GMT` which will be used for partition values, the values written by the default timezone option, which is `"GMT"` because the session local timezone is `"GMT"` here, are:
```scala
scala> spark.conf.set("spark.sql.session.timeZone", "GMT")
scala> val df = Seq((1, new java.sql.Timestamp(1451606400000L))).toDF("i", "ts")
df: org.apache.spark.sql.DataFrame = [i: int, ts: timestamp]
scala> df.show()
+---+-------------------+
| i| ts|
+---+-------------------+
| 1|2016-01-01 00:00:00|
+---+-------------------+
scala> df.write.partitionBy("ts").save("/path/to/gmtpartition")
```
```sh
$ ls /path/to/gmtpartition/
_SUCCESS ts=2016-01-01 00%3A00%3A00
```
whereas setting the option to `"PST"`, they are:
```scala
scala> df.write.option("timeZone", "PST").partitionBy("ts").save("/path/to/pstpartition")
```
```sh
$ ls /path/to/pstpartition/
_SUCCESS ts=2015-12-31 16%3A00%3A00
```
We can properly read the partition values if the session local timezone and the timezone of the partition values are the same:
```scala
scala> spark.read.load("/path/to/gmtpartition").show()
+---+-------------------+
| i| ts|
+---+-------------------+
| 1|2016-01-01 00:00:00|
+---+-------------------+
```
And even if the timezones are different, we can properly read the values with setting corrent timezone option:
```scala
// wrong result
scala> spark.read.load("/path/to/pstpartition").show()
+---+-------------------+
| i| ts|
+---+-------------------+
| 1|2015-12-31 16:00:00|
+---+-------------------+
// correct result
scala> spark.read.option("timeZone", "PST").load("/path/to/pstpartition").show()
+---+-------------------+
| i| ts|
+---+-------------------+
| 1|2016-01-01 00:00:00|
+---+-------------------+
```
## How was this patch tested?
Existing tests and added some tests.
Author: Takuya UESHIN <ueshin@happy-camper.st>
Closes #17053 from ueshin/issues/SPARK-18939.
Diffstat (limited to 'sql/hive')
3 files changed, 9 insertions, 5 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 50bb44f7d4..43d9c2bec6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.ColumnStat -import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.PartitioningUtils import org.apache.spark.sql.hive.client.HiveClient @@ -1008,7 +1008,8 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat override def listPartitionsByFilter( db: String, table: String, - predicates: Seq[Expression]): Seq[CatalogTablePartition] = withClient { + predicates: Seq[Expression], + defaultTimeZoneId: String): Seq[CatalogTablePartition] = withClient { val rawTable = getRawTable(db, table) val catalogTable = restoreTableMetadata(rawTable) val partitionColumnNames = catalogTable.partitionColumnNames.toSet @@ -1034,7 +1035,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val index = partitionSchema.indexWhere(_.name == att.name) BoundReference(index, partitionSchema(index).dataType, nullable = true) }) - clientPrunedPartitions.filter { p => boundPredicate(p.toRow(partitionSchema)) } + clientPrunedPartitions.filter { p => + boundPredicate(p.toRow(partitionSchema, defaultTimeZoneId)) + } } else { client.getPartitions(catalogTable).map { part => part.copy(spec = restorePartitionSpec(part.spec, partColNameMap)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala index 14b9565be0..28f074849c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala @@ -163,7 +163,8 @@ case class HiveTableScanExec( sparkSession.sharedState.externalCatalog.listPartitionsByFilter( relation.tableMeta.database, relation.tableMeta.identifier.table, - normalizedFilters) + normalizedFilters, + sparkSession.sessionState.conf.sessionLocalTimeZone) } else { sparkSession.sharedState.externalCatalog.listPartitions( relation.tableMeta.database, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index a60c210b04..4349f1aa23 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -52,7 +52,7 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { test("list partitions by filter") { val catalog = newBasicCatalog() - val selectedPartitions = catalog.listPartitionsByFilter("db2", "tbl2", Seq('a.int === 1)) + val selectedPartitions = catalog.listPartitionsByFilter("db2", "tbl2", Seq('a.int === 1), "GMT") assert(selectedPartitions.length == 1) assert(selectedPartitions.head.spec == part1.spec) } |