author     Josh Rosen <joshrosen@databricks.com>    2015-12-02 07:29:45 +0800
committer  Reynold Xin <rxin@databricks.com>        2015-12-02 07:29:45 +0800
commit     ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6 (patch)
tree       8a909ebb7ae6d21d0887d82b426ff641348d88a0 /sql
parent     f292018f8e57779debc04998456ec875f628133b (diff)
download   spark-ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6.tar.gz
           spark-ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6.tar.bz2
           spark-ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6.zip
[SPARK-12075][SQL] Speed up HiveComparisonTest by avoiding / speeding up TestHive.reset()

When profiling HiveCompatibilitySuite, I noticed that most of the time seems to be spent in expensive `TestHive.reset()` calls. This patch speeds up suites based on HiveComparisonTest, such as HiveCompatibilitySuite, with the following changes:

- Avoid `TestHive.reset()` whenever possible:
  - Use a simple set of heuristics to guess whether we need to call `reset()` between tests.
  - As a safety net, automatically re-run failed tests by calling `reset()` before the re-attempt.
- Speed up the expensive parts of `TestHive.reset()`: loading the `src` and `srcpart` tables took roughly 600ms per test, so we now avoid this with a simple heuristic that loads those tables only for tests that reference them. The heuristic is based on simple string matching over the test queries and errs on the side of loading the tables in more situations than strictly necessary.

After these changes, HiveCompatibilitySuite seems to run in about 10 minutes.

This PR is a revival of #6663, an earlier experimental PR from June in which I played around with several possible speedups for this suite.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #10055 from JoshRosen/speculative-testhive-reset.
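Before the diff itself, a minimal standalone sketch of the speculative-retry pattern described above may help; the names `SpeculativeResetSketch`, `runWithSpeculativeReset`, `canTryWithoutReset`, and `runTest` are invented for illustration, and the real logic lives in `HiveComparisonTest.createQueryTest` below:

import org.scalatest.exceptions.TestFailedException

object SpeculativeResetSketch {
  // Sketch only: speculatively run a test without the expensive reset(); if it
  // fails, retry exactly once with a full reset() as a safety net.
  def runWithSpeculativeReset(canTryWithoutReset: Boolean)(runTest: Boolean => Unit): Unit = {
    if (canTryWithoutReset) {
      try {
        runTest(false) // first attempt: skip reset()
      } catch {
        case _: TestFailedException => runTest(true) // safety net: retry with reset()
      }
    } else {
      runTest(true) // conservative path: always reset()
    }
  }
}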
Diffstat (limited to 'sql')
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala               |  7
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala | 67
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala  |  2
3 files changed, 62 insertions(+), 14 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 6883d305cb..2e2d201bf2 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -443,13 +443,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
defaultOverrides()
runSqlHive("USE default")
-
- // Just loading src makes a lot of tests pass. This is because some tests do something like
- // drop an index on src at the beginning. Since we just pass DDL to hive this bypasses our
- // Analyzer and thus the test table auto-loading mechanism.
- // Remove after we handle more DDL operations natively.
- loadTestTable("src")
- loadTestTable("srcpart")
} catch {
case e: Exception =>
logError("FATAL ERROR: Failed to reset TestDB state.", e)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index aa95ba94fa..4455430aa7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -209,7 +209,11 @@ abstract class HiveComparisonTest
   }
 
   val installHooksCommand = "(?i)SET.*hooks".r
 
-  def createQueryTest(testCaseName: String, sql: String, reset: Boolean = true) {
+  def createQueryTest(
+      testCaseName: String,
+      sql: String,
+      reset: Boolean = true,
+      tryWithoutResettingFirst: Boolean = false) {
     // testCaseName must not contain ':', which is not allowed to appear in a Windows filename
     assert(!testCaseName.contains(":"))
@@ -240,9 +244,6 @@ abstract class HiveComparisonTest
     test(testCaseName) {
       logDebug(s"=== HIVE TEST: $testCaseName ===")
 
-      // Clear old output for this testcase.
-      outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete())
-
       val sqlWithoutComment =
         sql.split("\n").filterNot(l => l.matches("--.*(?<=[^\\\\]);")).mkString("\n")
       val allQueries =
@@ -269,11 +270,32 @@ abstract class HiveComparisonTest
         }.mkString("\n== Console version of this test ==\n", "\n", "\n")
       }
 
-      try {
+      def doTest(reset: Boolean, isSpeculative: Boolean = false): Unit = {
+        // Clear old output for this testcase.
+        outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete())
+
         if (reset) {
           TestHive.reset()
         }
 
+        // Many tests drop indexes on src and srcpart at the beginning, so we need to load those
+        // tables here. Since DROP INDEX DDL is just passed to Hive, it bypasses the analyzer and
+        // thus the tables referenced in those DDL commands cannot be extracted for use by our
+        // test table auto-loading mechanism. In addition, the tests which use the SHOW TABLES
+        // command expect these tables to exist.
+        val hasShowTableCommand = queryList.exists(_.toLowerCase.contains("show tables"))
+        for (table <- Seq("src", "srcpart")) {
+          val hasMatchingQuery = queryList.exists { query =>
+            val normalizedQuery = query.toLowerCase.stripSuffix(";")
+            normalizedQuery.endsWith(table) ||
+              normalizedQuery.contains(s"from $table") ||
+              normalizedQuery.contains(s"from default.$table")
+          }
+          if (hasShowTableCommand || hasMatchingQuery) {
+            TestHive.loadTestTable(table)
+          }
+        }
+
         val hiveCacheFiles = queryList.zipWithIndex.map {
           case (queryString, i) =>
             val cachedAnswerName = s"$testCaseName-$i-${getMd5(queryString)}"
@@ -430,12 +452,45 @@ abstract class HiveComparisonTest
              """.stripMargin
 
           stringToFile(new File(wrongDirectory, testCaseName), errorMessage + consoleTestCase)
-          fail(errorMessage)
+          if (isSpeculative && !reset) {
+            fail("Failed on first run; retrying")
+          } else {
+            fail(errorMessage)
+          }
         }
       }
 
       // Touch passed file.
       new FileOutputStream(new File(passedDirectory, testCaseName)).close()
+      }
+
+      val canSpeculativelyTryWithoutReset: Boolean = {
+        val excludedSubstrings = Seq(
+          "into table",
+          "create table",
+          "drop index"
+        )
+        !queryList.map(_.toLowerCase).exists { query =>
+          excludedSubstrings.exists(s => query.contains(s))
+        }
+      }
+
+      try {
+        try {
+          if (tryWithoutResettingFirst && canSpeculativelyTryWithoutReset) {
+            doTest(reset = false, isSpeculative = true)
+          } else {
+            doTest(reset)
+          }
+        } catch {
+          case tf: org.scalatest.exceptions.TestFailedException =>
+            if (tryWithoutResettingFirst && canSpeculativelyTryWithoutReset) {
+              logWarning("Test failed without reset(); retrying with reset()")
+              doTest(reset = true)
+            } else {
+              throw tf
+            }
+        }
     } catch {
       case tf: org.scalatest.exceptions.TestFailedException => throw tf
       case originalException: Exception =>
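To make the behavior of the new table-loading heuristic in `doTest` concrete, here is a self-contained demo; the object `TableLoadHeuristicDemo` and its `tablesToLoad` method are invented for illustration and are not part of the patch:

object TableLoadHeuristicDemo {
  // Mirror of the string-matching heuristic above: guess which of the src and
  // srcpart test tables a list of queries references.
  def tablesToLoad(queryList: Seq[String]): Seq[String] = {
    val hasShowTablesCommand = queryList.exists(_.toLowerCase.contains("show tables"))
    Seq("src", "srcpart").filter { table =>
      val hasMatchingQuery = queryList.exists { query =>
        val normalizedQuery = query.toLowerCase.stripSuffix(";")
        normalizedQuery.endsWith(table) ||
          normalizedQuery.contains(s"from $table") ||
          normalizedQuery.contains(s"from default.$table")
      }
      hasShowTablesCommand || hasMatchingQuery
    }
  }

  def main(args: Array[String]): Unit = {
    println(tablesToLoad(Seq("SELECT key FROM src")))   // List(src)
    println(tablesToLoad(Seq("SHOW TABLES")))           // List(src, srcpart)
    println(tablesToLoad(Seq("SELECT 1")))              // List()
  }
}

Note that over-matching is deliberate: a query such as "SELECT * FROM srcpart" also loads `src`, because "from src" is a substring of "from srcpart"; the heuristic errs toward loading a table unnecessarily rather than missing one.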
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
index f7b37dae0a..f96c989c46 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
@@ -59,7 +59,7 @@ abstract class HiveQueryFileTest extends HiveComparisonTest {
         runAll) {
       // Build a test case and submit it to scala test framework...
       val queriesString = fileToString(testCaseFile)
-      createQueryTest(testCaseName, queriesString)
+      createQueryTest(testCaseName, queriesString, reset = true, tryWithoutResettingFirst = true)
     } else {
       // Only output warnings for the built-in whitelist, as this clutters the output when the
       // user is trying to execute a single test from the command line.
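For reference, a hypothetical direct call with the new parameter (the test case name and query below are invented for illustration):

createQueryTest(
  "groupby-simple",                               // hypothetical test case name
  "SELECT key, count(*) FROM src GROUP BY key;",  // hypothetical query text
  reset = true,                                   // fall back to reset() when speculation is not safe
  tryWithoutResettingFirst = true)                // speculatively skip reset() on the first attempt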