Diffstat (limited to 'dev/sparktestsupport')
-rw-r--r--  dev/sparktestsupport/modules.py  | 54
-rw-r--r--  dev/sparktestsupport/toposort.py | 85
2 files changed, 131 insertions(+), 8 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 032c0616ed..07c3078e45 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -15,12 +15,14 @@
# limitations under the License.
#
+from functools import total_ordering
import itertools
import re
all_modules = []
+@total_ordering
class Module(object):
"""
A module is the basic abstraction in our test runner script. Each module consists of a set of
@@ -75,20 +77,56 @@ class Module(object):
    def contains_file(self, filename):
        return any(re.match(p, filename) for p in self.source_file_prefixes)

+    def __repr__(self):
+        return "Module<%s>" % self.name
+
+    def __lt__(self, other):
+        return self.name < other.name
+
+    def __eq__(self, other):
+        return self.name == other.name
+
+    def __ne__(self, other):
+        return not (self.name == other.name)
+
+    def __hash__(self):
+        return hash(self.name)
+
+
+catalyst = Module(
+    name="catalyst",
+    dependencies=[],
+    source_file_regexes=[
+        "sql/catalyst/",
+    ],
+    sbt_test_goals=[
+        "catalyst/test",
+    ],
+)
+
sql = Module(
    name="sql",
-    dependencies=[],
+    dependencies=[catalyst],
    source_file_regexes=[
-        "sql/(?!hive-thriftserver)",
+        "sql/core/",
+    ],
+    sbt_test_goals=[
+        "sql/test",
+    ],
+)
+
+hive = Module(
+    name="hive",
+    dependencies=[sql],
+    source_file_regexes=[
+        "sql/hive/",
        "bin/spark-sql",
    ],
    build_profile_flags=[
        "-Phive",
    ],
    sbt_test_goals=[
-        "catalyst/test",
-        "sql/test",
        "hive/test",
    ],
    test_tags=[
@@ -99,7 +137,7 @@ sql = Module(
hive_thriftserver = Module(
    name="hive-thriftserver",
-    dependencies=[sql],
+    dependencies=[hive],
    source_file_regexes=[
        "sql/hive-thriftserver",
        "sbin/start-thriftserver.sh",
@@ -282,7 +320,7 @@ mllib = Module(
examples = Module(
    name="examples",
-    dependencies=[graphx, mllib, streaming, sql],
+    dependencies=[graphx, mllib, streaming, hive],
    source_file_regexes=[
        "examples/",
    ],
@@ -314,7 +352,7 @@ pyspark_core = Module(
pyspark_sql = Module(
    name="pyspark-sql",
-    dependencies=[pyspark_core, sql],
+    dependencies=[pyspark_core, hive],
    source_file_regexes=[
        "python/pyspark/sql"
    ],
@@ -404,7 +442,7 @@ pyspark_ml = Module(
sparkr = Module(
    name="sparkr",
-    dependencies=[sql, mllib],
+    dependencies=[hive, mllib],
    source_file_regexes=[
        "R/",
    ],
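
Taken together, the modules.py changes split the old monolithic sql module into catalyst, sql, and hive, and rebase hive-thriftserver, examples, pyspark-sql, and sparkr onto hive; the new __lt__/__eq__/__hash__ methods (completed by @total_ordering) let Module objects live in sets and sort deterministically by name. Below is a minimal sketch of how that dependency graph can be fed to the toposort helper introduced in the second file. It is illustrative only: it assumes dev/ is on the Python path and that Module stores its dependencies argument as a self.dependencies attribute, neither of which is shown in this diff.

    from sparktestsupport.modules import catalyst, hive, hive_thriftserver, sql
    from sparktestsupport.toposort import toposort_flatten

    # Map each module to the set of modules it depends on. Module.__hash__
    # makes set membership work; @total_ordering plus __lt__ lets sorted()
    # order each level deterministically.
    graph = {m: set(m.dependencies)
             for m in [catalyst, sql, hive, hive_thriftserver]}

    # Every module appears after all of its dependencies:
    # [Module<catalyst>, Module<sql>, Module<hive>, Module<hive-thriftserver>]
    print(toposort_flatten(graph))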
diff --git a/dev/sparktestsupport/toposort.py b/dev/sparktestsupport/toposort.py
new file mode 100644
index 0000000000..6c67b4504b
--- /dev/null
+++ b/dev/sparktestsupport/toposort.py
@@ -0,0 +1,85 @@
+#######################################################################
+# Implements a topological sort algorithm.
+#
+# Copyright 2014 True Blade Systems, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Notes:
+#  Based on http://code.activestate.com/recipes/578272-topological-sort
+#   with these major changes:
+#    Added unittests.
+#    Deleted doctests (maybe not the best idea in the world, but it cleans
+#     up the docstring).
+#    Moved functools import to the top of the file.
+#    Changed assert to a ValueError.
+#    Changed iter[items|keys] to [items|keys] for python 3
+#     compatibility; I don't think it matters for python 2, where these
+#     are now lists instead of iterables.
+#    Copy the input so as to leave it unmodified.
+#    Renamed function from toposort2 to toposort.
+#    Handle empty input.
+#    Switch tests to use set literals.
+#
+########################################################################
+
+from functools import reduce as _reduce
+
+
+__all__ = ['toposort', 'toposort_flatten']
+
+
+def toposort(data):
+    """Dependencies are expressed as a dictionary whose keys are items
+and whose values are a set of that item's dependencies. Output is a
+list of sets in topological order. The first set consists of items
+with no dependencies; each subsequent set consists of items that
+depend upon items in the preceding sets.
+"""
+
+    # Special case empty input.
+    if len(data) == 0:
+        return
+
+    # Copy the input so as to leave it unmodified.
+    data = data.copy()
+
+    # Ignore self dependencies.
+    for k, v in data.items():
+        v.discard(k)
+    # Find all items that don't depend on anything.
+    extra_items_in_deps = _reduce(set.union, data.values()) - set(data.keys())
+    # Add empty dependencies where needed.
+    data.update({item: set() for item in extra_items_in_deps})
+    while True:
+        ordered = set(item for item, dep in data.items() if len(dep) == 0)
+        if not ordered:
+            break
+        yield ordered
+        data = {item: (dep - ordered)
+                for item, dep in data.items()
+                if item not in ordered}
+    if len(data) != 0:
+        raise ValueError('Cyclic dependencies exist among these items: {}'.format(
+            ', '.join(repr(x) for x in data.items())))
+
+
+def toposort_flatten(data, sort=True):
+    """Returns a single list of dependencies. For any set returned by
+toposort(), those items are sorted and appended to the result (just to
+make the results deterministic)."""
+
+    result = []
+    for d in toposort(data):
+        result.extend((sorted if sort else list)(d))
+    return result