author    Josh Rosen <joshrosen@databricks.com>    2016-01-26 14:20:11 -0800
committer Josh Rosen <joshrosen@databricks.com>    2016-01-26 14:20:11 -0800
commit    ee74498de372b16fe6350e3617e9e6ec87c6ae7b (patch)
tree      0adf34b8e4c9421d79b04988b4c39e8715e6a5f6 /dev/run-tests.py
parent    fbf7623d49525e3aa6b08f482afd7ee8118d80cb (diff)
[SPARK-8725][PROJECT-INFRA] Test modules in topologically-sorted order in dev/run-tests
This patch improves our `dev/run-tests` script to test modules in a topologically-sorted order based on modules' dependencies. This helps ensure that bugs in upstream projects are not misattributed to downstream projects just because those projects' tests were the first ones to exhibit the failure. Topological sorting also shortens the feedback loop when testing pull requests: if I make a change in SQL, then the SQL tests should run before MLlib's, not after.

In addition, this patch updates our test module definitions to split `sql` into `catalyst`, `sql`, and `hive`, so that more tests can be skipped when only `hive/` files change.

Author: Josh Rosen <joshrosen@databricks.com>

Closes #10885 from JoshRosen/SPARK-8725.
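For context, here is a minimal sketch of the ordering behavior this patch relies on, using the same `toposort_flatten` helper the patch imports (vendored in Spark as `sparktestsupport.toposort`). The dependency mapping below is an illustrative stand-in, not Spark's real module graph:

    from toposort import toposort_flatten  # vendored as sparktestsupport.toposort

    # Map each module to the set of modules it depends on (illustrative only).
    dependencies = {
        "catalyst": set(),
        "sql": {"catalyst"},
        "hive": {"sql"},
        "mllib": {"sql"},
    }

    # Dependencies come out before their dependents; sort=True breaks ties
    # between independent modules by sorting them, so output is deterministic.
    print(toposort_flatten(dependencies, sort=True))
    # ['catalyst', 'sql', 'hive', 'mllib']

This is why the doctests below expect, e.g., 'sql' before 'hive' and 'mllib', rather than plain alphabetical order.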
Diffstat (limited to 'dev/run-tests.py')
-rwxr-xr-x  dev/run-tests.py  25
1 file changed, 15 insertions(+), 10 deletions(-)
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 8f47728f20..c78a66f6aa 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -29,6 +29,7 @@ from collections import namedtuple
from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
+from sparktestsupport.toposort import toposort_flatten, toposort
import sparktestsupport.modules as modules
@@ -43,7 +44,7 @@ def determine_modules_for_files(filenames):
If a file is not associated with a more specific submodule, then this method will consider that
file to belong to the 'root' module.
- >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
+ >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"]))
['pyspark-core', 'sql']
>>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
['root']
@@ -99,14 +100,16 @@ def determine_modules_to_test(changed_modules):
Given a set of modules that have changed, compute the transitive closure of those modules'
dependent modules in order to determine the set of modules that should be tested.
- >>> sorted(x.name for x in determine_modules_to_test([modules.root]))
+ Returns a topologically-sorted list of modules (ties are broken by sorting on module names).
+
+ >>> [x.name for x in determine_modules_to_test([modules.root])]
['root']
- >>> sorted(x.name for x in determine_modules_to_test([modules.graphx]))
- ['examples', 'graphx']
- >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql]))
+ >>> [x.name for x in determine_modules_to_test([modules.graphx])]
+ ['graphx', 'examples']
+ >>> x = [x.name for x in determine_modules_to_test([modules.sql])]
>>> x # doctest: +NORMALIZE_WHITESPACE
- ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \
- 'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql']
+ ['sql', 'hive', 'mllib', 'examples', 'hive-thriftserver', 'pyspark-sql', 'sparkr',
+ 'pyspark-mllib', 'pyspark-ml']
"""
# If we're going to have to run all of the tests, then we can just short-circuit
# and return 'root'. No module depends on root, so if it appears then it will be
@@ -116,7 +119,9 @@ def determine_modules_to_test(changed_modules):
modules_to_test = set()
for module in changed_modules:
modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
- return modules_to_test.union(set(changed_modules))
+ modules_to_test = modules_to_test.union(set(changed_modules))
+ return toposort_flatten(
+ {m: set(m.dependencies).intersection(modules_to_test) for m in modules_to_test}, sort=True)
def determine_tags_to_exclude(changed_modules):
@@ -377,12 +382,12 @@ def run_scala_tests_maven(test_profiles):
def run_scala_tests_sbt(test_modules, test_profiles):
- sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
+ sbt_test_goals = list(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
if not sbt_test_goals:
return
- profiles_and_goals = test_profiles + list(sbt_test_goals)
+ profiles_and_goals = test_profiles + sbt_test_goals
print("[info] Running Spark tests using SBT with these arguments: ",
" ".join(profiles_and_goals))