Diffstat (limited to 'dev/run-tests.py')
-rwxr-xr-x | dev/run-tests.py | 435
1 file changed, 58 insertions, 377 deletions
diff --git a/dev/run-tests.py b/dev/run-tests.py
index e7c09b0f40..c51b0d3010 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -17,297 +17,23 @@
 # limitations under the License.
 #
 
+from __future__ import print_function
 import itertools
 import os
 import re
 import sys
-import shutil
 import subprocess
 from collections import namedtuple
 
-SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
-USER_HOME = os.environ.get("HOME")
-
+from sparktestsupport import SPARK_HOME, USER_HOME
+from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
+import sparktestsupport.modules as modules
 
 
 # -------------------------------------------------------------------------------------------------
-# Test module definitions and functions for traversing module dependency graph
+# Functions for traversing module dependency graph
 # -------------------------------------------------------------------------------------------------
 
-all_modules = []
-
-
-class Module(object):
-    """
-    A module is the basic abstraction in our test runner script. Each module consists of a set of
-    source files, a set of test commands, and a set of dependencies on other modules. We use modules
-    to define a dependency graph that lets us determine which tests to run based on which files have
-    changed.
-    """
-
-    def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
-                 sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False):
-        """
-        Define a new module.
-
-        :param name: A short module name, for display in logging and error messages.
-        :param dependencies: A set of dependencies for this module. This should only include direct
-            dependencies; transitive dependencies are resolved automatically.
-        :param source_file_regexes: a set of regexes that match source files belonging to this
-            module. These regexes are applied by attempting to match at the beginning of the
-            filename strings.
-        :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
-            order to build and test this module (e.g. '-PprofileName').
-        :param sbt_test_goals: A set of SBT test goals for testing this module.
-        :param should_run_python_tests: If true, changes in this module will trigger Python tests.
-            For now, this has the effect of causing _all_ Python tests to be run, although in the
-            future this should be changed to run only a subset of the Python tests that depend
-            on this module.
-        :param should_run_r_tests: If true, changes in this module will trigger all R tests.
- """ - self.name = name - self.dependencies = dependencies - self.source_file_prefixes = source_file_regexes - self.sbt_test_goals = sbt_test_goals - self.build_profile_flags = build_profile_flags - self.should_run_python_tests = should_run_python_tests - self.should_run_r_tests = should_run_r_tests - - self.dependent_modules = set() - for dep in dependencies: - dep.dependent_modules.add(self) - all_modules.append(self) - - def contains_file(self, filename): - return any(re.match(p, filename) for p in self.source_file_prefixes) - - -sql = Module( - name="sql", - dependencies=[], - source_file_regexes=[ - "sql/(?!hive-thriftserver)", - "bin/spark-sql", - ], - build_profile_flags=[ - "-Phive", - ], - sbt_test_goals=[ - "catalyst/test", - "sql/test", - "hive/test", - ] -) - - -hive_thriftserver = Module( - name="hive-thriftserver", - dependencies=[sql], - source_file_regexes=[ - "sql/hive-thriftserver", - "sbin/start-thriftserver.sh", - ], - build_profile_flags=[ - "-Phive-thriftserver", - ], - sbt_test_goals=[ - "hive-thriftserver/test", - ] -) - - -graphx = Module( - name="graphx", - dependencies=[], - source_file_regexes=[ - "graphx/", - ], - sbt_test_goals=[ - "graphx/test" - ] -) - - -streaming = Module( - name="streaming", - dependencies=[], - source_file_regexes=[ - "streaming", - ], - sbt_test_goals=[ - "streaming/test", - ] -) - - -streaming_kinesis_asl = Module( - name="kinesis-asl", - dependencies=[streaming], - source_file_regexes=[ - "extras/kinesis-asl/", - ], - build_profile_flags=[ - "-Pkinesis-asl", - ], - sbt_test_goals=[ - "kinesis-asl/test", - ] -) - - -streaming_zeromq = Module( - name="streaming-zeromq", - dependencies=[streaming], - source_file_regexes=[ - "external/zeromq", - ], - sbt_test_goals=[ - "streaming-zeromq/test", - ] -) - - -streaming_twitter = Module( - name="streaming-twitter", - dependencies=[streaming], - source_file_regexes=[ - "external/twitter", - ], - sbt_test_goals=[ - "streaming-twitter/test", - ] -) - - -streaming_mqtt = Module( - name="streaming-mqtt", - dependencies=[streaming], - source_file_regexes=[ - "external/mqtt", - ], - sbt_test_goals=[ - "streaming-mqtt/test", - ] -) - - -streaming_kafka = Module( - name="streaming-kafka", - dependencies=[streaming], - source_file_regexes=[ - "external/kafka", - "external/kafka-assembly", - ], - sbt_test_goals=[ - "streaming-kafka/test", - ] -) - - -streaming_flume_sink = Module( - name="streaming-flume-sink", - dependencies=[streaming], - source_file_regexes=[ - "external/flume-sink", - ], - sbt_test_goals=[ - "streaming-flume-sink/test", - ] -) - - -streaming_flume = Module( - name="streaming_flume", - dependencies=[streaming], - source_file_regexes=[ - "external/flume", - ], - sbt_test_goals=[ - "streaming-flume/test", - ] -) - - -mllib = Module( - name="mllib", - dependencies=[streaming, sql], - source_file_regexes=[ - "data/mllib/", - "mllib/", - ], - sbt_test_goals=[ - "mllib/test", - ] -) - - -examples = Module( - name="examples", - dependencies=[graphx, mllib, streaming, sql], - source_file_regexes=[ - "examples/", - ], - sbt_test_goals=[ - "examples/test", - ] -) - - -pyspark = Module( - name="pyspark", - dependencies=[mllib, streaming, streaming_kafka, sql], - source_file_regexes=[ - "python/" - ], - should_run_python_tests=True -) - - -sparkr = Module( - name="sparkr", - dependencies=[sql, mllib], - source_file_regexes=[ - "R/", - ], - should_run_r_tests=True -) - - -docs = Module( - name="docs", - dependencies=[], - source_file_regexes=[ - "docs/", - ] -) - - -ec2 = Module( - name="ec2", - 
-    dependencies=[],
-    source_file_regexes=[
-        "ec2/",
-    ]
-)
-
-
-# The root module is a dummy module which is used to run all of the tests.
-# No other modules should directly depend on this module.
-root = Module(
-    name="root",
-    dependencies=[],
-    source_file_regexes=[],
-    # In order to run all of the tests, enable every test profile:
-    build_profile_flags=
-        list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
-    sbt_test_goals=[
-        "test",
-    ],
-    should_run_python_tests=True,
-    should_run_r_tests=True
-)
-
-
 def determine_modules_for_files(filenames):
     """
     Given a list of filenames, return the set of modules that contain those files.
@@ -315,19 +41,19 @@ def determine_modules_for_files(filenames):
     file to belong to the 'root' module.
 
     >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
-    ['pyspark', 'sql']
+    ['pyspark-core', 'sql']
     >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
     ['root']
     """
     changed_modules = set()
     for filename in filenames:
         matched_at_least_one_module = False
-        for module in all_modules:
+        for module in modules.all_modules:
             if module.contains_file(filename):
                 changed_modules.add(module)
                 matched_at_least_one_module = True
         if not matched_at_least_one_module:
-            changed_modules.add(root)
+            changed_modules.add(modules.root)
     return changed_modules
 
 
@@ -352,7 +78,8 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
         run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
     else:
         diff_target = target_ref
-    raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target])
+    raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
+                                         universal_newlines=True)
     # Remove any empty strings
     return [f for f in raw_output.split('\n') if f]
 
 
@@ -362,18 +89,20 @@ def determine_modules_to_test(changed_modules):
     Given a set of modules that have changed, compute the transitive closure of those modules'
     dependent modules in order to determine the set of modules that should be tested.
 
-    >>> sorted(x.name for x in determine_modules_to_test([root]))
+    >>> sorted(x.name for x in determine_modules_to_test([modules.root]))
     ['root']
-    >>> sorted(x.name for x in determine_modules_to_test([graphx]))
+    >>> sorted(x.name for x in determine_modules_to_test([modules.graphx]))
     ['examples', 'graphx']
-    >>> sorted(x.name for x in determine_modules_to_test([sql]))
-    ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql']
+    >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql]))
+    >>> x # doctest: +NORMALIZE_WHITESPACE
+    ['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', 'pyspark-ml', \
+    'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming', 'sparkr', 'sql']
     """
     # If we're going to have to run all of the tests, then we can just short-circuit
     # and return 'root'. No module depends on root, so if it appears then it will be
     # in changed_modules.
-    if root in changed_modules:
-        return [root]
+    if modules.root in changed_modules:
+        return [modules.root]
     modules_to_test = set()
     for module in changed_modules:
         modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
@@ -398,60 +127,6 @@ def get_error_codes(err_code_file):
 ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh"))
 
 
-def exit_from_command_with_retcode(cmd, retcode):
-    print "[error] running", ' '.join(cmd), "; received return code", retcode
-    sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
-
-
-def rm_r(path):
-    """Given an arbitrary path properly remove it with the correct python
-    construct if it exists
-    - from: http://stackoverflow.com/a/9559881"""
-
-    if os.path.isdir(path):
-        shutil.rmtree(path)
-    elif os.path.exists(path):
-        os.remove(path)
-
-
-def run_cmd(cmd):
-    """Given a command as a list of arguments will attempt to execute the
-    command from the determined SPARK_HOME directory and, on failure, print
-    an error message"""
-
-    if not isinstance(cmd, list):
-        cmd = cmd.split()
-    try:
-        subprocess.check_call(cmd)
-    except subprocess.CalledProcessError as e:
-        exit_from_command_with_retcode(e.cmd, e.returncode)
-
-
-def is_exe(path):
-    """Check if a given path is an executable file
-    - from: http://stackoverflow.com/a/377028"""
-
-    return os.path.isfile(path) and os.access(path, os.X_OK)
-
-
-def which(program):
-    """Find and return the given program by its absolute path or 'None'
-    - from: http://stackoverflow.com/a/377028"""
-
-    fpath = os.path.split(program)[0]
-
-    if fpath:
-        if is_exe(program):
-            return program
-    else:
-        for path in os.environ.get("PATH").split(os.pathsep):
-            path = path.strip('"')
-            exe_file = os.path.join(path, program)
-            if is_exe(exe_file):
-                return exe_file
-    return None
-
-
 def determine_java_executable():
     """Will return the path of the java executable that will be used by Spark's
     tests or `None`"""
@@ -476,7 +151,8 @@ def determine_java_version(java_exe):
     with accessors '.major', '.minor', '.patch', '.update'"""
 
     raw_output = subprocess.check_output([java_exe, "-version"],
-                                         stderr=subprocess.STDOUT)
+                                         stderr=subprocess.STDOUT,
+                                         universal_newlines=True)
 
     raw_output_lines = raw_output.split('\n')
 
@@ -504,10 +180,10 @@ def set_title_and_block(title, err_block):
     os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block]
     line_str = '=' * 72
 
-    print
-    print line_str
-    print title
-    print line_str
+    print('')
+    print(line_str)
+    print(title)
+    print(line_str)
 
 
 def run_apache_rat_checks():
@@ -534,8 +210,8 @@ def build_spark_documentation():
     jekyll_bin = which("jekyll")
 
     if not jekyll_bin:
-        print "[error] Cannot find a version of `jekyll` on the system; please",
-        print "install one and retry to build documentation."
+        print("[error] Cannot find a version of `jekyll` on the system; please"
+              " install one and retry to build documentation.")
         sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
     else:
         run_cmd([jekyll_bin, "build"])
@@ -571,7 +247,7 @@ def exec_sbt(sbt_args=()):
     echo_proc.wait()
     for line in iter(sbt_proc.stdout.readline, ''):
         if not sbt_output_filter.match(line):
-            print line,
+            print(line, end='')
     retcode = sbt_proc.wait()
 
     if retcode > 0:
@@ -594,33 +270,33 @@ def get_hadoop_profiles(hadoop_version):
     if hadoop_version in sbt_maven_hadoop_profiles:
         return sbt_maven_hadoop_profiles[hadoop_version]
     else:
-        print "[error] Could not find", hadoop_version, "in the list. Valid options",
Valid options", - print "are", sbt_maven_hadoop_profiles.keys() + print("[error] Could not find", hadoop_version, "in the list. Valid options" + " are", sbt_maven_hadoop_profiles.keys()) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) def build_spark_maven(hadoop_version): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals - print "[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments: " + " ".join(profiles_and_goals)) exec_maven(profiles_and_goals) def build_spark_sbt(hadoop_version): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags sbt_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals - print "[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments: " + " ".join(profiles_and_goals)) exec_sbt(profiles_and_goals) @@ -648,8 +324,8 @@ def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] profiles_and_goals = test_profiles + mvn_test_goals - print "[info] Running Spark tests using Maven with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Running Spark tests using Maven with these arguments: " + " ".join(profiles_and_goals)) exec_maven(profiles_and_goals) @@ -663,8 +339,8 @@ def run_scala_tests_sbt(test_modules, test_profiles): profiles_and_goals = test_profiles + list(sbt_test_goals) - print "[info] Running Spark tests using SBT with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Running Spark tests using SBT with these arguments: " + " ".join(profiles_and_goals)) exec_sbt(profiles_and_goals) @@ -684,10 +360,13 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): run_scala_tests_sbt(test_modules, test_profiles) -def run_python_tests(): +def run_python_tests(test_modules): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - run_cmd([os.path.join(SPARK_HOME, "python", "run-tests")]) + command = [os.path.join(SPARK_HOME, "python", "run-tests")] + if test_modules != [modules.root]: + command.append("--modules=%s" % ','.join(m.name for m in modules)) + run_cmd(command) def run_sparkr_tests(): @@ -697,14 +376,14 @@ def run_sparkr_tests(): run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) else: - print "Ignoring SparkR tests as R was not found in PATH" + print("Ignoring SparkR tests as R was not found in PATH") def main(): # Ensure the user home directory (HOME) is valid and is an absolute directory if not USER_HOME or not os.path.isabs(USER_HOME): - print "[error] Cannot determine your home directory as an absolute path;", - print "ensure the $HOME environment variable is set properly." 
+ print("[error] Cannot determine your home directory as an absolute path;" + " ensure the $HOME environment variable is set properly.") sys.exit(1) os.chdir(SPARK_HOME) @@ -718,14 +397,14 @@ def main(): java_exe = determine_java_executable() if not java_exe: - print "[error] Cannot find a version of `java` on the system; please", - print "install one and retry." + print("[error] Cannot find a version of `java` on the system; please" + " install one and retry.") sys.exit(2) java_version = determine_java_version(java_exe) if java_version.minor < 8: - print "[warn] Java 8 tests will not run because JDK version is < 1.8." + print("[warn] Java 8 tests will not run because JDK version is < 1.8.") if os.environ.get("AMPLAB_JENKINS"): # if we're on the Amplab Jenkins build servers setup variables @@ -741,8 +420,8 @@ def main(): hadoop_version = "hadoop2.3" test_env = "local" - print "[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, - print "under environment", test_env + print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, + "under environment", test_env) changed_modules = None changed_files = None @@ -751,8 +430,9 @@ def main(): changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) if not changed_modules: - changed_modules = [root] - print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) + changed_modules = [modules.root] + print("[info] Found the following changed modules:", + ", ".join(x.name for x in changed_modules)) test_modules = determine_modules_to_test(changed_modules) @@ -779,8 +459,9 @@ def main(): # run the test suites run_scala_tests(build_tool, hadoop_version, test_modules) - if any(m.should_run_python_tests for m in test_modules): - run_python_tests() + modules_with_python_tests = [m for m in test_modules if m.python_test_goals] + if modules_with_python_tests: + run_python_tests(modules_with_python_tests) if any(m.should_run_r_tests for m in test_modules): run_sparkr_tests() |