-rw-r--r--  NOTICE                            16
-rwxr-xr-x  dev/run-tests.py                  25
-rw-r--r--  dev/sparktestsupport/modules.py   54
-rw-r--r--  dev/sparktestsupport/toposort.py  85
4 files changed, 162 insertions, 18 deletions
diff --git a/NOTICE b/NOTICE
index e416aadce9..6a26155fb4 100644
--- a/NOTICE
+++ b/NOTICE
@@ -650,3 +650,19 @@ For CSV functionality:
*/
+===============================================================================
+For dev/sparktestsupport/toposort.py:
+
+Copyright 2014 True Blade Systems, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 8f47728f20..c78a66f6aa 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -29,6 +29,7 @@ from collections import namedtuple
from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
+from sparktestsupport.toposort import toposort_flatten, toposort
import sparktestsupport.modules as modules
@@ -43,7 +44,7 @@ def determine_modules_for_files(filenames):
    If a file is not associated with a more specific submodule, then this method will consider that
    file to belong to the 'root' module.
-    >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
+    >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"]))
    ['pyspark-core', 'sql']
    >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
    ['root']
@@ -99,14 +100,16 @@ def determine_modules_to_test(changed_modules):
    Given a set of modules that have changed, compute the transitive closure of those modules'
    dependent modules in order to determine the set of modules that should be tested.
-    >>> sorted(x.name for x in determine_modules_to_test([modules.root]))
+    Returns a topologically-sorted list of modules (ties are broken by sorting on module names).
+
+    >>> [x.name for x in determine_modules_to_test([modules.root])]
    ['root']
-    >>> sorted(x.name for x in determine_modules_to_test([modules.graphx]))
-    ['examples', 'graphx']
-    >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql]))
+    >>> [x.name for x in determine_modules_to_test([modules.graphx])]
+    ['graphx', 'examples']
+    >>> x = [x.name for x in determine_modules_to_test([modules.sql])]
    >>> x # doctest: +NORMALIZE_WHITESPACE
-    ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \
-    'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql']
+    ['sql', 'hive', 'mllib', 'examples', 'hive-thriftserver', 'pyspark-sql', 'sparkr',
+     'pyspark-mllib', 'pyspark-ml']
    """
    # If we're going to have to run all of the tests, then we can just short-circuit
    # and return 'root'. No module depends on root, so if it appears then it will be
@@ -116,7 +119,9 @@ def determine_modules_to_test(changed_modules):
    modules_to_test = set()
    for module in changed_modules:
        modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
-    return modules_to_test.union(set(changed_modules))
+    modules_to_test = modules_to_test.union(set(changed_modules))
+    return toposort_flatten(
+        {m: set(m.dependencies).intersection(modules_to_test) for m in modules_to_test}, sort=True)
def determine_tags_to_exclude(changed_modules):
@@ -377,12 +382,12 @@ def run_scala_tests_maven(test_profiles):
def run_scala_tests_sbt(test_modules, test_profiles):
-    sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
+    sbt_test_goals = list(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
    if not sbt_test_goals:
        return
-    profiles_and_goals = test_profiles + list(sbt_test_goals)
+    profiles_and_goals = test_profiles + sbt_test_goals
    print("[info] Running Spark tests using SBT with these arguments: ",
          " ".join(profiles_and_goals))
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 032c0616ed..07c3078e45 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -15,12 +15,14 @@
# limitations under the License.
#
+from functools import total_ordering
import itertools
import re
all_modules = []
+@total_ordering
class Module(object):
"""
A module is the basic abstraction in our test runner script. Each module consists of a set of
@@ -75,20 +77,56 @@ class Module(object):
    def contains_file(self, filename):
        return any(re.match(p, filename) for p in self.source_file_prefixes)
+    def __repr__(self):
+        return "Module<%s>" % self.name
+
+    def __lt__(self, other):
+        return self.name < other.name
+
+    def __eq__(self, other):
+        return self.name == other.name
+
+    def __ne__(self, other):
+        return not (self.name == other.name)
+
+    def __hash__(self):
+        return hash(self.name)
+
+
+catalyst = Module(
+    name="catalyst",
+    dependencies=[],
+    source_file_regexes=[
+        "sql/catalyst/",
+    ],
+    sbt_test_goals=[
+        "catalyst/test",
+    ],
+)
+
sql = Module(
    name="sql",
-    dependencies=[],
+    dependencies=[catalyst],
    source_file_regexes=[
-        "sql/(?!hive-thriftserver)",
+        "sql/core/",
+    ],
+    sbt_test_goals=[
+        "sql/test",
+    ],
+)
+
+hive = Module(
+    name="hive",
+    dependencies=[sql],
+    source_file_regexes=[
+        "sql/hive/",
        "bin/spark-sql",
    ],
    build_profile_flags=[
        "-Phive",
    ],
    sbt_test_goals=[
-        "catalyst/test",
-        "sql/test",
        "hive/test",
    ],
    test_tags=[
@@ -99,7 +137,7 @@ sql = Module(
hive_thriftserver = Module(
    name="hive-thriftserver",
-    dependencies=[sql],
+    dependencies=[hive],
    source_file_regexes=[
        "sql/hive-thriftserver",
        "sbin/start-thriftserver.sh",
@@ -282,7 +320,7 @@ mllib = Module(
examples = Module(
    name="examples",
-    dependencies=[graphx, mllib, streaming, sql],
+    dependencies=[graphx, mllib, streaming, hive],
    source_file_regexes=[
        "examples/",
    ],
@@ -314,7 +352,7 @@ pyspark_core = Module(
pyspark_sql = Module(
    name="pyspark-sql",
-    dependencies=[pyspark_core, sql],
+    dependencies=[pyspark_core, hive],
    source_file_regexes=[
        "python/pyspark/sql"
    ],
@@ -404,7 +442,7 @@ pyspark_ml = Module(
sparkr = Module(
    name="sparkr",
-    dependencies=[sql, mllib],
+    dependencies=[hive, mllib],
    source_file_regexes=[
        "R/",
    ],
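
A rough standalone illustration (a stand-in class, not the real dev/sparktestsupport/modules.py) of why Module gains __repr__, __eq__, __hash__ and an @total_ordering-based __lt__: the new toposort_flatten(..., sort=True) call in run-tests.py uses Module objects as dictionary keys and set members, and sorts them to break ties deterministically:

from functools import total_ordering


@total_ordering
class Module(object):
    # Trimmed stand-in for the real Module class; only the pieces needed to
    # demonstrate hashing and ordering are kept.
    def __init__(self, name, dependencies=()):
        self.name = name
        self.dependencies = list(dependencies)

    def __repr__(self):
        return "Module<%s>" % self.name

    def __lt__(self, other):
        return self.name < other.name

    def __eq__(self, other):
        return self.name == other.name

    def __hash__(self):
        return hash(self.name)


graphx = Module("graphx")
examples = Module("examples", dependencies=[graphx])

# Hashable, so modules can key the dependency dictionary and live in sets;
# orderable, so sorted() gives a stable tie-break within each toposort level.
print(sorted({examples, graphx}))  # [Module<examples>, Module<graphx>]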
diff --git a/dev/sparktestsupport/toposort.py b/dev/sparktestsupport/toposort.py
new file mode 100644
index 0000000000..6c67b4504b
--- /dev/null
+++ b/dev/sparktestsupport/toposort.py
@@ -0,0 +1,85 @@
+#######################################################################
+# Implements a topological sort algorithm.
+#
+# Copyright 2014 True Blade Systems, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Notes:
+# Based on http://code.activestate.com/recipes/578272-topological-sort
+# with these major changes:
+# Added unittests.
+# Deleted doctests (maybe not the best idea in the world, but it cleans
+# up the docstring).
+# Moved functools import to the top of the file.
+# Changed assert to a ValueError.
+# Changed iter[items|keys] to [items|keys], for python 3
+# compatibility. I don't think it matters for python 2 that these
+# are now lists instead of iterables.
+# Copy the input so as to leave it unmodified.
+# Renamed function from toposort2 to toposort.
+# Handle empty input.
+# Switch tests to use set literals.
+#
+########################################################################
+
+from functools import reduce as _reduce
+
+
+__all__ = ['toposort', 'toposort_flatten']
+
+
+def toposort(data):
+    """Dependencies are expressed as a dictionary whose keys are items
+and whose values are a set of dependent items. Output is a list of
+sets in topological order. The first set consists of items with no
+dependencies, each subsequent set consists of items that depend upon
+items in the preceding sets.
+"""
+
+    # Special case empty input.
+    if len(data) == 0:
+        return
+
+    # Copy the input so as to leave it unmodified.
+    data = data.copy()
+
+    # Ignore self dependencies.
+    for k, v in data.items():
+        v.discard(k)
+    # Find all items that don't depend on anything.
+    extra_items_in_deps = _reduce(set.union, data.values()) - set(data.keys())
+    # Add empty dependencies where needed.
+    data.update({item: set() for item in extra_items_in_deps})
+    while True:
+        ordered = set(item for item, dep in data.items() if len(dep) == 0)
+        if not ordered:
+            break
+        yield ordered
+        data = {item: (dep - ordered)
+                for item, dep in data.items()
+                if item not in ordered}
+    if len(data) != 0:
+        raise ValueError('Cyclic dependencies exist among these items: {}'.format(
+            ', '.join(repr(x) for x in data.items())))
+
+
+def toposort_flatten(data, sort=True):
+    """Returns a single list of dependencies. For any set returned by
+toposort(), those items are sorted and appended to the result (just to
+make the results deterministic)."""
+
+    result = []
+    for d in toposort(data):
+        result.extend((sorted if sort else list)(d))
+    return result
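
A quick usage sketch of the vendored helper (illustrative data only, again assuming dev/ is on sys.path): toposort() yields one set of items per dependency level, and toposort_flatten() concatenates the sorted levels into a single list:

from sparktestsupport.toposort import toposort, toposort_flatten

# Keys depend on the items in their value sets; 3, 5 and 7 appear only as
# dependencies, so they are treated as having no dependencies of their own.
data = {2: {11}, 9: {11, 8}, 10: {11, 3}, 11: {7, 5}, 8: {7, 3}}

print(list(toposort(data)))
# [{3, 5, 7}, {8, 11}, {2, 9, 10}] (ordering within each set is arbitrary)

print(toposort_flatten(data))
# [3, 5, 7, 8, 11, 2, 9, 10]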