author Eric Liang <ekl@databricks.com> 2016-11-23 20:14:08 +0800
committer Wenchen Fan <wenchen@databricks.com> 2016-11-23 20:14:08 +0800
commit 85235ed6c600270e3fa434738bd50dce3564440a (patch)
tree 77d7f7bcd2b5e30369aa1bb9bb4b5f0cdecd3f02 /sql/hive/src/test
parent 7e0cd1d9b168286386f15e9b55988733476ae2bb (diff)
[SPARK-18545][SQL] Verify number of hive client RPCs in PartitionedTablePerfStatsSuite
## What changes were proposed in this pull request?

This would help catch accidental O(n) calls to the hive client, as in https://issues.apache.org/jira/browse/SPARK-18507.

## How was this patch tested?

Checked that the test fails before https://issues.apache.org/jira/browse/SPARK-18507 was patched.

cc cloud-fan

Author: Eric Liang <ekl@databricks.com>

Closes #15985 from ericl/spark-18545.
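The new assertions follow a simple counter-based pattern: reset the static HiveCatalogMetrics counters, run a single query, and assert an upper bound on the Hive client calls it triggered. Below is a minimal standalone sketch of that pattern, assuming a Hive-enabled SparkSession, a Spark build that includes this patch, and a partitioned table named `test` already created as in setupPartitionedHiveTable in the diff; RpcCountCheck is a hypothetical wrapper, not part of this patch.

import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.SparkSession

object RpcCountCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

    // Zero the static counters, then run one partition-pruned query.
    HiveCatalogMetrics.reset()
    spark.sql("select * from test where partCol1 = 1").count()

    // Pruning should cost a small, constant number of metastore RPCs,
    // independent of how many partitions the table has.
    val calls = HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount()
    assert(calls > 0 && calls < 10, s"unexpected hive client call count: $calls")
  }
}

Because the counters are static and process-wide, resetting before each query is what makes the per-operation upper bound meaningful; a hard cap of 10 is how the suite distinguishes O(1) metastore traffic from the O(n)-per-partition regressions this test is meant to catch.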
Diffstat (limited to 'sql/hive/src/test')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala  58
1 file changed, 56 insertions(+), 2 deletions(-)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
index b41bc862e9..9838b9a4eb 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
@@ -57,7 +57,11 @@ class PartitionedTablePerfStatsSuite
}

private def setupPartitionedHiveTable(tableName: String, dir: File): Unit = {
- spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
+ setupPartitionedHiveTable(tableName, dir, 5)
+ }
+
+ private def setupPartitionedHiveTable(tableName: String, dir: File, scale: Int): Unit = {
+ spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
.partitionBy("partCol1", "partCol2")
.mode("overwrite")
.parquet(dir.getAbsolutePath)
@@ -71,7 +75,11 @@ class PartitionedTablePerfStatsSuite
}

private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = {
- spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
+ setupPartitionedDatasourceTable(tableName, dir, 5)
+ }
+
+ private def setupPartitionedDatasourceTable(tableName: String, dir: File, scale: Int): Unit = {
+ spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
.partitionBy("partCol1", "partCol2")
.mode("overwrite")
.parquet(dir.getAbsolutePath)
@@ -242,6 +250,52 @@ class PartitionedTablePerfStatsSuite
}
}
+ test("hive table: num hive client calls does not scale with partition count") {
+ withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+ withTable("test") {
+ withTempDir { dir =>
+ setupPartitionedHiveTable("test", dir, scale = 100)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("select * from test where partCol1 = 1").count() == 1)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() > 0)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("select * from test").count() == 100)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("show partitions test").count() == 100)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+ }
+ }
+ }
+ }
+
+ test("datasource table: num hive client calls does not scale with partition count") {
+ withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+ withTable("test") {
+ withTempDir { dir =>
+ setupPartitionedDatasourceTable("test", dir, scale = 100)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("select * from test where partCol1 = 1").count() == 1)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() > 0)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("select * from test").count() == 100)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("show partitions test").count() == 100)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+ }
+ }
+ }
+ }
+

test("hive table: files read and cached when filesource partition management is off") {
withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {
withTable("test") {