From d9eb4c7215f26dd05527c0b9980af35087ab9d64 Mon Sep 17 00:00:00 2001
From: Eric Liang
Date: Sun, 4 Dec 2016 20:44:04 +0800
Subject: [SPARK-18661][SQL] Creating a partitioned datasource table should not scan all files for the table

## What changes were proposed in this pull request?

Even though in 2.1 creating a partitioned datasource table will not populate the partition data by default (until the user issues MSCK REPAIR TABLE), we still scan the filesystem for no good reason. We should avoid doing this when the user specifies a schema. (A standalone sketch of the intended behavior is appended after the patch.)

## How was this patch tested?

Perf stat tests.

Author: Eric Liang

Closes #16090 from ericl/spark-18661.
---
 .../sql/hive/PartitionedTablePerfStatsSuite.scala | 51 ++++++++++++++++++++--
 1 file changed, 47 insertions(+), 4 deletions(-)

(limited to 'sql/hive/src/test')

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
index 9838b9a4eb..65c02d473b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
@@ -60,36 +60,52 @@ class PartitionedTablePerfStatsSuite
     setupPartitionedHiveTable(tableName, dir, 5)
   }
 
-  private def setupPartitionedHiveTable(tableName: String, dir: File, scale: Int): Unit = {
+  private def setupPartitionedHiveTable(
+      tableName: String, dir: File, scale: Int,
+      clearMetricsBeforeCreate: Boolean = false, repair: Boolean = true): Unit = {
     spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
       .partitionBy("partCol1", "partCol2")
       .mode("overwrite")
       .parquet(dir.getAbsolutePath)
 
+    if (clearMetricsBeforeCreate) {
+      HiveCatalogMetrics.reset()
+    }
+
     spark.sql(s"""
       |create external table $tableName (fieldOne long)
       |partitioned by (partCol1 int, partCol2 int)
       |stored as parquet
       |location "${dir.getAbsolutePath}"""".stripMargin)
-    spark.sql(s"msck repair table $tableName")
+    if (repair) {
+      spark.sql(s"msck repair table $tableName")
+    }
   }
 
   private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = {
     setupPartitionedDatasourceTable(tableName, dir, 5)
   }
 
-  private def setupPartitionedDatasourceTable(tableName: String, dir: File, scale: Int): Unit = {
+  private def setupPartitionedDatasourceTable(
+      tableName: String, dir: File, scale: Int,
+      clearMetricsBeforeCreate: Boolean = false, repair: Boolean = true): Unit = {
     spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
       .partitionBy("partCol1", "partCol2")
       .mode("overwrite")
       .parquet(dir.getAbsolutePath)
 
+    if (clearMetricsBeforeCreate) {
+      HiveCatalogMetrics.reset()
+    }
+
     spark.sql(s"""
       |create table $tableName (fieldOne long, partCol1 int, partCol2 int)
       |using parquet
       |options (path "${dir.getAbsolutePath}")
       |partitioned by (partCol1, partCol2)""".stripMargin)
-    spark.sql(s"msck repair table $tableName")
+    if (repair) {
+      spark.sql(s"msck repair table $tableName")
+    }
   }
 
   genericTest("partitioned pruned table reports only selected files") { spec =>
@@ -250,6 +266,33 @@ class PartitionedTablePerfStatsSuite
     }
   }
 
+  test("datasource table: table setup does not scan filesystem") {
+    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+      withTable("test") {
+        withTempDir { dir =>
+          setupPartitionedDatasourceTable(
+            "test", dir, scale = 10, clearMetricsBeforeCreate = true, repair = false)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+        }
+      }
+    }
+  }
+
+  test("hive table: table setup does not scan filesystem") {
+    withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+      withTable("test") {
+        withTempDir { dir =>
+          HiveCatalogMetrics.reset()
+          setupPartitionedHiveTable(
+            "test", dir, scale = 10, clearMetricsBeforeCreate = true, repair = false)
+          assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
+          assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
+        }
+      }
+    }
+  }
+
   test("hive table: num hive client calls does not scale with partition count") {
     withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
       withTable("test") {
--
cgit v1.2.3
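
For context, here is a minimal standalone sketch of the behavior the new tests assert, without the suite's helpers. Assumptions (not part of the patch): a Hive-enabled `SparkSession` named `spark` with `spark.sql.hive.manageFilesourcePartitions` set to `true` (the 2.1 default), access to the internal `HiveCatalogMetrics` counters used above, and the hypothetical names `demo` and `/tmp/demo_table`.

```scala
import org.apache.spark.metrics.source.HiveCatalogMetrics

// Lay out partitioned parquet data on disk; no table is registered yet.
spark.range(10)
  .selectExpr("id as fieldOne", "id as partCol1", "id as partCol2")
  .write
  .partitionBy("partCol1", "partCol2")
  .mode("overwrite")
  .parquet("/tmp/demo_table")  // hypothetical location

HiveCatalogMetrics.reset()

// With an explicit schema, creating the table should not list any files;
// partition discovery is deferred until an explicit MSCK REPAIR TABLE.
spark.sql("""
  |create table demo (fieldOne long, partCol1 int, partCol2 int)
  |using parquet
  |options (path "/tmp/demo_table")
  |partitioned by (partCol1, partCol2)""".stripMargin)
assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)

// Only the explicit repair should walk the filesystem and find the files.
spark.sql("msck repair table demo")
assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() > 0)
```

The `clearMetricsBeforeCreate` and `repair` flags added to the suite's setup helpers express exactly this split: reset the counters just before the CREATE TABLE, skip the repair, and then assert that no files were discovered or cached during table creation.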