author     Josh Rosen <joshrosen@databricks.com>    2015-12-02 07:29:45 +0800
committer  Reynold Xin <rxin@databricks.com>        2015-12-02 07:29:45 +0800
commit     ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6 (patch)
tree       8a909ebb7ae6d21d0887d82b426ff641348d88a0 /sql
parent     f292018f8e57779debc04998456ec875f628133b (diff)
download   spark-ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6.tar.gz
           spark-ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6.tar.bz2
           spark-ef6790fdc3b70b9d6184ec2b3d926e4b0e4b15f6.zip
[SPARK-12075][SQL] Speed up HiveComparisonTest by avoiding / speeding up TestHive.reset()

When profiling HiveCompatibilitySuite, I noticed that most of the time seems to be spent in expensive `TestHive.reset()` calls. This patch speeds up suites based on HiveComparisonTest, such as HiveCompatibilitySuite, with the following changes:

- Avoid `TestHive.reset()` whenever possible:
  - Use a simple set of heuristics to guess whether we need to call `reset()` between tests.
  - As a safety net, automatically re-run failed tests by calling `reset()` before the re-attempt.
- Speed up the expensive parts of `TestHive.reset()`: loading the `src` and `srcpart` tables took roughly 600ms per test, so we now avoid this with a simple heuristic that loads those tables only for tests that reference them. The heuristic is based on simple string matching over the test queries and errs on the side of loading the tables in more situations than strictly necessary.

After these changes, HiveCompatibilitySuite seems to run in about 10 minutes.

This PR is a revival of #6663, an earlier experimental PR from June in which I played around with several possible speedups for this suite.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #10055 from JoshRosen/speculative-testhive-reset.
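Before the diff itself, a minimal standalone sketch of the speculative-retry pattern described above may help; the names `SpeculativeResetSketch`, `runWithSpeculativeReset`, `canTryWithoutReset`, and `runTest` are invented for illustration, and the real logic lives in `HiveComparisonTest.createQueryTest` below:

import org.scalatest.exceptions.TestFailedException

object SpeculativeResetSketch {
  // Sketch only: speculatively run a test without the expensive reset(); if it
  // fails, retry exactly once with a full reset() as a safety net.
  def runWithSpeculativeReset(canTryWithoutReset: Boolean)(runTest: Boolean => Unit): Unit = {
    if (canTryWithoutReset) {
      try {
        runTest(false) // first attempt: skip reset()
      } catch {
        case _: TestFailedException => runTest(true) // safety net: retry with reset()
      }
    } else {
      runTest(true) // conservative path: always reset()
    }
  }
}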
Diffstat (limited to 'sql')
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala               |  7
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala | 67
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala  |  2
3 files changed, 62 insertions(+), 14 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 6883d305cb..2e2d201bf2 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -443,13 +443,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
defaultOverrides()
runSqlHive("USE default")
-
- // Just loading src makes a lot of tests pass. This is because some tests do something like
- // drop an index on src at the beginning. Since we just pass DDL to hive this bypasses our
- // Analyzer and thus the test table auto-loading mechanism.
- // Remove after we handle more DDL operations natively.
- loadTestTable("src")
- loadTestTable("srcpart")
} catch {
case e: Exception =>
logError("FATAL ERROR: Failed to reset TestDB state.", e)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index aa95ba94fa..4455430aa7 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -209,7 +209,11 @@ abstract class HiveComparisonTest
   }
 
   val installHooksCommand = "(?i)SET.*hooks".r
 
-  def createQueryTest(testCaseName: String, sql: String, reset: Boolean = true) {
+  def createQueryTest(
+      testCaseName: String,
+      sql: String,
+      reset: Boolean = true,
+      tryWithoutResettingFirst: Boolean = false) {
     // testCaseName must not contain ':', which is not allowed to appear in a Windows filename
     assert(!testCaseName.contains(":"))
@@ -240,9 +244,6 @@ abstract class HiveComparisonTest
     test(testCaseName) {
       logDebug(s"=== HIVE TEST: $testCaseName ===")
 
-      // Clear old output for this testcase.
-      outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete())
-
       val sqlWithoutComment =
         sql.split("\n").filterNot(l => l.matches("--.*(?<=[^\\\\]);")).mkString("\n")
       val allQueries =
@@ -269,11 +270,32 @@ abstract class HiveComparisonTest
         }.mkString("\n== Console version of this test ==\n", "\n", "\n")
       }
 
-      try {
+      def doTest(reset: Boolean, isSpeculative: Boolean = false): Unit = {
+        // Clear old output for this testcase.
+        outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete())
+
         if (reset) {
           TestHive.reset()
         }
 
+        // Many tests drop indexes on src and srcpart at the beginning, so we need to load those
+        // tables here. Since DROP INDEX DDL is just passed to Hive, it bypasses the analyzer and
+        // thus the tables referenced in those DDL commands cannot be extracted for use by our
+        // test table auto-loading mechanism. In addition, the tests which use the SHOW TABLES
+        // command expect these tables to exist.
+        val hasShowTableCommand = queryList.exists(_.toLowerCase.contains("show tables"))
+        for (table <- Seq("src", "srcpart")) {
+          val hasMatchingQuery = queryList.exists { query =>
+            val normalizedQuery = query.toLowerCase.stripSuffix(";")
+            normalizedQuery.endsWith(table) ||
+              normalizedQuery.contains(s"from $table") ||
+              normalizedQuery.contains(s"from default.$table")
+          }
+          if (hasShowTableCommand || hasMatchingQuery) {
+            TestHive.loadTestTable(table)
+          }
+        }
+
         val hiveCacheFiles = queryList.zipWithIndex.map {
           case (queryString, i) =>
             val cachedAnswerName = s"$testCaseName-$i-${getMd5(queryString)}"
@@ -430,12 +452,45 @@ abstract class HiveComparisonTest
              """.stripMargin
 
           stringToFile(new File(wrongDirectory, testCaseName), errorMessage + consoleTestCase)
-          fail(errorMessage)
+          if (isSpeculative && !reset) {
+            fail("Failed on first run; retrying")
+          } else {
+            fail(errorMessage)
+          }
         }
       }
 
       // Touch passed file.
       new FileOutputStream(new File(passedDirectory, testCaseName)).close()
+      }
+
+      val canSpeculativelyTryWithoutReset: Boolean = {
+        val excludedSubstrings = Seq(
+          "into table",
+          "create table",
+          "drop index"
+        )
+        !queryList.map(_.toLowerCase).exists { query =>
+          excludedSubstrings.exists(s => query.contains(s))
+        }
+      }
+
+      try {
+        try {
+          if (tryWithoutResettingFirst && canSpeculativelyTryWithoutReset) {
+            doTest(reset = false, isSpeculative = true)
+          } else {
+            doTest(reset)
+          }
+        } catch {
+          case tf: org.scalatest.exceptions.TestFailedException =>
+            if (tryWithoutResettingFirst && canSpeculativelyTryWithoutReset) {
+              logWarning("Test failed without reset(); retrying with reset()")
+              doTest(reset = true)
+            } else {
+              throw tf
+            }
+        }
     } catch {
       case tf: org.scalatest.exceptions.TestFailedException => throw tf
       case originalException: Exception =>
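To make the behavior of the new table-loading heuristic in `doTest` concrete, here is a self-contained demo; the object `TableLoadHeuristicDemo` and its `tablesToLoad` method are invented for illustration and are not part of the patch:

object TableLoadHeuristicDemo {
  // Mirror of the string-matching heuristic above: guess which of the src and
  // srcpart test tables a list of queries references.
  def tablesToLoad(queryList: Seq[String]): Seq[String] = {
    val hasShowTablesCommand = queryList.exists(_.toLowerCase.contains("show tables"))
    Seq("src", "srcpart").filter { table =>
      val hasMatchingQuery = queryList.exists { query =>
        val normalizedQuery = query.toLowerCase.stripSuffix(";")
        normalizedQuery.endsWith(table) ||
          normalizedQuery.contains(s"from $table") ||
          normalizedQuery.contains(s"from default.$table")
      }
      hasShowTablesCommand || hasMatchingQuery
    }
  }

  def main(args: Array[String]): Unit = {
    println(tablesToLoad(Seq("SELECT key FROM src")))   // List(src)
    println(tablesToLoad(Seq("SHOW TABLES")))           // List(src, srcpart)
    println(tablesToLoad(Seq("SELECT 1")))              // List()
  }
}

Note that over-matching is deliberate: a query such as "SELECT * FROM srcpart" also loads `src`, because "from src" is a substring of "from srcpart"; the heuristic errs toward loading a table unnecessarily rather than missing one.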
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
index f7b37dae0a..f96c989c46 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala
@@ -59,7 +59,7 @@ abstract class HiveQueryFileTest extends HiveComparisonTest {
         runAll) {
       // Build a test case and submit it to scala test framework...
       val queriesString = fileToString(testCaseFile)
-      createQueryTest(testCaseName, queriesString)
+      createQueryTest(testCaseName, queriesString, reset = true, tryWithoutResettingFirst = true)
     } else {
       // Only output warnings for the built-in whitelist, as this clutters the output when the
       // user is trying to execute a single test from the command line.
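For reference, a hypothetical direct call with the new parameter (the test case name and query below are invented for illustration):

createQueryTest(
  "groupby-simple",                               // hypothetical test case name
  "SELECT key, count(*) FROM src GROUP BY key;",  // hypothetical query text
  reset = true,                                   // fall back to reset() when speculation is not safe
  tryWithoutResettingFirst = true)                // speculatively skip reset() on the first attempt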