author Eric Liang <ekl@databricks.com> 2016-11-23 20:14:08 +0800
committer Wenchen Fan <wenchen@databricks.com> 2016-11-23 20:14:08 +0800
commit 85235ed6c600270e3fa434738bd50dce3564440a (patch)
tree 77d7f7bcd2b5e30369aa1bb9bb4b5f0cdecd3f02 /sql/hive/src/test
parent 7e0cd1d9b168286386f15e9b55988733476ae2bb (diff)
[SPARK-18545][SQL] Verify number of hive client RPCs in PartitionedTablePerfStatsSuite
## What changes were proposed in this pull request?

This would help catch accidental O(n) calls to the hive client, as in https://issues.apache.org/jira/browse/SPARK-18507.

## How was this patch tested?

Checked that the test fails before https://issues.apache.org/jira/browse/SPARK-18507 was patched.

cc cloud-fan

Author: Eric Liang <ekl@databricks.com>

Closes #15985 from ericl/spark-18545.
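The new assertions follow a simple counter-based pattern: reset the static HiveCatalogMetrics counters, run a single query, and assert an upper bound on the Hive client calls it triggered. Below is a minimal standalone sketch of that pattern, assuming a Hive-enabled SparkSession, a Spark build that includes this patch, and a partitioned table named `test` already created as in setupPartitionedHiveTable in the diff; RpcCountCheck is a hypothetical wrapper, not part of this patch.

import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.SparkSession

object RpcCountCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

    // Zero the static counters, then run one partition-pruned query.
    HiveCatalogMetrics.reset()
    spark.sql("select * from test where partCol1 = 1").count()

    // Pruning should cost a small, constant number of metastore RPCs,
    // independent of how many partitions the table has.
    val calls = HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount()
    assert(calls > 0 && calls < 10, s"unexpected hive client call count: $calls")
  }
}

Because the counters are static and process-wide, resetting before each query is what makes the per-operation upper bound meaningful; a hard cap of 10 is how the suite distinguishes O(1) metastore traffic from the O(n)-per-partition regressions this test is meant to catch.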
Diffstat (limited to 'sql/hive/src/test')
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala  58
1 file changed, 56 insertions(+), 2 deletions(-)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
index b41bc862e9..9838b9a4eb 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala
@@ -57,7 +57,11 @@ class PartitionedTablePerfStatsSuite
}

private def setupPartitionedHiveTable(tableName: String, dir: File): Unit = {
- spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
+ setupPartitionedHiveTable(tableName, dir, 5)
+ }
+
+ private def setupPartitionedHiveTable(tableName: String, dir: File, scale: Int): Unit = {
+ spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
.partitionBy("partCol1", "partCol2")
.mode("overwrite")
.parquet(dir.getAbsolutePath)
@@ -71,7 +75,11 @@ class PartitionedTablePerfStatsSuite
}

private def setupPartitionedDatasourceTable(tableName: String, dir: File): Unit = {
- spark.range(5).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
+ setupPartitionedDatasourceTable(tableName, dir, 5)
+ }
+
+ private def setupPartitionedDatasourceTable(tableName: String, dir: File, scale: Int): Unit = {
+ spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
.partitionBy("partCol1", "partCol2")
.mode("overwrite")
.parquet(dir.getAbsolutePath)
@@ -242,6 +250,52 @@ class PartitionedTablePerfStatsSuite
}
}
+ test("hive table: num hive client calls does not scale with partition count") {
+ withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+ withTable("test") {
+ withTempDir { dir =>
+ setupPartitionedHiveTable("test", dir, scale = 100)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("select * from test where partCol1 = 1").count() == 1)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() > 0)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("select * from test").count() == 100)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("show partitions test").count() == 100)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+ }
+ }
+ }
+ }
+
+ test("datasource table: num hive client calls does not scale with partition count") {
+ withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
+ withTable("test") {
+ withTempDir { dir =>
+ setupPartitionedDatasourceTable("test", dir, scale = 100)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("select * from test where partCol1 = 1").count() == 1)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() > 0)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("select * from test").count() == 100)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+
+ HiveCatalogMetrics.reset()
+ assert(spark.sql("show partitions test").count() == 100)
+ assert(HiveCatalogMetrics.METRIC_HIVE_CLIENT_CALLS.getCount() < 10)
+ }
+ }
+ }
+ }
+

test("hive table: files read and cached when filesource partition management is off") {
withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "false") {
withTable("test") {