2 files changed, 131 insertions, 8 deletions
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 032c0616ed..07c3078e45 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -15,12 +15,14 @@
 # limitations under the License.
 #
 
+from functools import total_ordering
 import itertools
 import re
 
 all_modules = []
 
 
+@total_ordering
 class Module(object):
     """
     A module is the basic abstraction in our test runner script. Each module consists of a set of
@@ -75,20 +77,56 @@ class Module(object):
     def contains_file(self, filename):
         return any(re.match(p, filename) for p in self.source_file_prefixes)
 
+    def __repr__(self):
+        return "Module<%s>" % self.name
+
+    def __lt__(self, other):
+        return self.name < other.name
+
+    def __eq__(self, other):
+        return self.name == other.name
+
+    def __ne__(self, other):
+        return not (self.name == other.name)
+
+    def __hash__(self):
+        return hash(self.name)
+
+
+catalyst = Module(
+    name="catalyst",
+    dependencies=[],
+    source_file_regexes=[
+        "sql/catalyst/",
+    ],
+    sbt_test_goals=[
+        "catalyst/test",
+    ],
+)
+
 
 sql = Module(
     name="sql",
-    dependencies=[],
+    dependencies=[catalyst],
     source_file_regexes=[
-        "sql/(?!hive-thriftserver)",
+        "sql/core/",
+    ],
+    sbt_test_goals=[
+        "sql/test",
+    ],
+)
+
+hive = Module(
+    name="hive",
+    dependencies=[sql],
+    source_file_regexes=[
+        "sql/hive/",
         "bin/spark-sql",
     ],
     build_profile_flags=[
         "-Phive",
     ],
     sbt_test_goals=[
-        "catalyst/test",
-        "sql/test",
         "hive/test",
     ],
     test_tags=[
@@ -99,7 +137,7 @@ sql = Module(
 
 hive_thriftserver = Module(
     name="hive-thriftserver",
-    dependencies=[sql],
+    dependencies=[hive],
     source_file_regexes=[
         "sql/hive-thriftserver",
         "sbin/start-thriftserver.sh",
@@ -282,7 +320,7 @@ mllib = Module(
 
 examples = Module(
     name="examples",
-    dependencies=[graphx, mllib, streaming, sql],
+    dependencies=[graphx, mllib, streaming, hive],
     source_file_regexes=[
         "examples/",
     ],
@@ -314,7 +352,7 @@ pyspark_core = Module(
 
 pyspark_sql = Module(
     name="pyspark-sql",
-    dependencies=[pyspark_core, sql],
+    dependencies=[pyspark_core, hive],
     source_file_regexes=[
         "python/pyspark/sql"
     ],
@@ -404,7 +442,7 @@ pyspark_ml = Module(
 
 sparkr = Module(
     name="sparkr",
-    dependencies=[sql, mllib],
+    dependencies=[hive, mllib],
     source_file_regexes=[
         "R/",
     ],
diff --git a/dev/sparktestsupport/toposort.py b/dev/sparktestsupport/toposort.py
new file mode 100644
index 0000000000..6c67b4504b
--- /dev/null
+++ b/dev/sparktestsupport/toposort.py
@@ -0,0 +1,85 @@
+#######################################################################
+# Implements a topological sort algorithm.
+#
+# Copyright 2014 True Blade Systems, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Notes:
+#  Based on http://code.activestate.com/recipes/578272-topological-sort
+#   with these major changes:
+#    Added unittests.
+#    Deleted doctests (maybe not the best idea in the world, but it cleans
+#     up the docstring).
+#    Moved functools import to the top of the file.
+#    Changed assert to a ValueError.
+#    Changed iter[items|keys] to [items|keys], for python 3
+#     compatibility. I don't think it matters for python 2 these are
+#     now lists instead of iterables.
+#    Copy the input so as to leave it unmodified.
+#    Renamed function from toposort2 to toposort.
+#    Handle empty input.
+#    Switch tests to use set literals.
+#
+########################################################################
+
+from functools import reduce as _reduce
+
+
+__all__ = ['toposort', 'toposort_flatten']
+
+
+def toposort(data):
+    """Dependencies are expressed as a dictionary whose keys are items
+and whose values are a set of dependent items. Output is a list of
+sets in topological order. The first set consists of items with no
+dependences, each subsequent set consists of items that depend upon
+items in the preceeding sets.
+"""
+
+    # Special case empty input.
+    if len(data) == 0:
+        return
+
+    # Copy the input so as to leave it unmodified.
+    data = data.copy()
+
+    # Ignore self dependencies.
+    for k, v in data.items():
+        v.discard(k)
+    # Find all items that don't depend on anything.
+    extra_items_in_deps = _reduce(set.union, data.values()) - set(data.keys())
+    # Add empty dependences where needed.
+    data.update({item: set() for item in extra_items_in_deps})
+    while True:
+        ordered = set(item for item, dep in data.items() if len(dep) == 0)
+        if not ordered:
+            break
+        yield ordered
+        data = {item: (dep - ordered)
+                for item, dep in data.items()
+                if item not in ordered}
+    if len(data) != 0:
+        raise ValueError('Cyclic dependencies exist among these items: {}'.format(
+            ', '.join(repr(x) for x in data.items())))
+
+
+def toposort_flatten(data, sort=True):
+    """Returns a single list of dependencies. For any set returned by
+toposort(), those items are sorted and appended to the result (just to
+make the results deterministic)."""
+
+    result = []
+    for d in toposort(data):
+        result.extend((sorted if sort else list)(d))
+    return result