Diffstat (limited to 'dev/run-tests.py')
-rwxr-xr-x  dev/run-tests.py | 435
1 file changed, 58 insertions(+), 377 deletions(-)
diff --git a/dev/run-tests.py b/dev/run-tests.py
index e7c09b0f40..c51b0d3010 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -17,297 +17,23 @@
# limitations under the License.
#
+from __future__ import print_function
import itertools
import os
import re
import sys
-import shutil
import subprocess
from collections import namedtuple
-SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
-USER_HOME = os.environ.get("HOME")
-
+from sparktestsupport import SPARK_HOME, USER_HOME
+from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
+import sparktestsupport.modules as modules
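+# Module definitions and their dependency metadata are provided by the sparktestsupport.modules package.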
# -------------------------------------------------------------------------------------------------
-# Test module definitions and functions for traversing module dependency graph
+# Functions for traversing module dependency graph
# -------------------------------------------------------------------------------------------------
-all_modules = []
-
-
-class Module(object):
- """
- A module is the basic abstraction in our test runner script. Each module consists of a set of
- source files, a set of test commands, and a set of dependencies on other modules. We use modules
- to define a dependency graph that lets determine which tests to run based on which files have
- changed.
- """
-
- def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
- sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False):
- """
- Define a new module.
-
- :param name: A short module name, for display in logging and error messages.
- :param dependencies: A set of dependencies for this module. This should only include direct
- dependencies; transitive dependencies are resolved automatically.
- :param source_file_regexes: a set of regexes that match source files belonging to this
- module. These regexes are applied by attempting to match at the beginning of the
- filename strings.
- :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in
- order to build and test this module (e.g. '-PprofileName').
- :param sbt_test_goals: A set of SBT test goals for testing this module.
- :param should_run_python_tests: If true, changes in this module will trigger Python tests.
- For now, this has the effect of causing _all_ Python tests to be run, although in the
- future this should be changed to run only a subset of the Python tests that depend
- on this module.
- :param should_run_r_tests: If true, changes in this module will trigger all R tests.
- """
- self.name = name
- self.dependencies = dependencies
- self.source_file_prefixes = source_file_regexes
- self.sbt_test_goals = sbt_test_goals
- self.build_profile_flags = build_profile_flags
- self.should_run_python_tests = should_run_python_tests
- self.should_run_r_tests = should_run_r_tests
-
- self.dependent_modules = set()
- for dep in dependencies:
- dep.dependent_modules.add(self)
- all_modules.append(self)
-
- def contains_file(self, filename):
- return any(re.match(p, filename) for p in self.source_file_prefixes)
-
-
-sql = Module(
- name="sql",
- dependencies=[],
- source_file_regexes=[
- "sql/(?!hive-thriftserver)",
- "bin/spark-sql",
- ],
- build_profile_flags=[
- "-Phive",
- ],
- sbt_test_goals=[
- "catalyst/test",
- "sql/test",
- "hive/test",
- ]
-)
-
-
-hive_thriftserver = Module(
- name="hive-thriftserver",
- dependencies=[sql],
- source_file_regexes=[
- "sql/hive-thriftserver",
- "sbin/start-thriftserver.sh",
- ],
- build_profile_flags=[
- "-Phive-thriftserver",
- ],
- sbt_test_goals=[
- "hive-thriftserver/test",
- ]
-)
-
-
-graphx = Module(
- name="graphx",
- dependencies=[],
- source_file_regexes=[
- "graphx/",
- ],
- sbt_test_goals=[
- "graphx/test"
- ]
-)
-
-
-streaming = Module(
- name="streaming",
- dependencies=[],
- source_file_regexes=[
- "streaming",
- ],
- sbt_test_goals=[
- "streaming/test",
- ]
-)
-
-
-streaming_kinesis_asl = Module(
- name="kinesis-asl",
- dependencies=[streaming],
- source_file_regexes=[
- "extras/kinesis-asl/",
- ],
- build_profile_flags=[
- "-Pkinesis-asl",
- ],
- sbt_test_goals=[
- "kinesis-asl/test",
- ]
-)
-
-
-streaming_zeromq = Module(
- name="streaming-zeromq",
- dependencies=[streaming],
- source_file_regexes=[
- "external/zeromq",
- ],
- sbt_test_goals=[
- "streaming-zeromq/test",
- ]
-)
-
-
-streaming_twitter = Module(
- name="streaming-twitter",
- dependencies=[streaming],
- source_file_regexes=[
- "external/twitter",
- ],
- sbt_test_goals=[
- "streaming-twitter/test",
- ]
-)
-
-
-streaming_mqtt = Module(
- name="streaming-mqtt",
- dependencies=[streaming],
- source_file_regexes=[
- "external/mqtt",
- ],
- sbt_test_goals=[
- "streaming-mqtt/test",
- ]
-)
-
-
-streaming_kafka = Module(
- name="streaming-kafka",
- dependencies=[streaming],
- source_file_regexes=[
- "external/kafka",
- "external/kafka-assembly",
- ],
- sbt_test_goals=[
- "streaming-kafka/test",
- ]
-)
-
-
-streaming_flume_sink = Module(
- name="streaming-flume-sink",
- dependencies=[streaming],
- source_file_regexes=[
- "external/flume-sink",
- ],
- sbt_test_goals=[
- "streaming-flume-sink/test",
- ]
-)
-
-
-streaming_flume = Module(
- name="streaming_flume",
- dependencies=[streaming],
- source_file_regexes=[
- "external/flume",
- ],
- sbt_test_goals=[
- "streaming-flume/test",
- ]
-)
-
-
-mllib = Module(
- name="mllib",
- dependencies=[streaming, sql],
- source_file_regexes=[
- "data/mllib/",
- "mllib/",
- ],
- sbt_test_goals=[
- "mllib/test",
- ]
-)
-
-
-examples = Module(
- name="examples",
- dependencies=[graphx, mllib, streaming, sql],
- source_file_regexes=[
- "examples/",
- ],
- sbt_test_goals=[
- "examples/test",
- ]
-)
-
-
-pyspark = Module(
- name="pyspark",
- dependencies=[mllib, streaming, streaming_kafka, sql],
- source_file_regexes=[
- "python/"
- ],
- should_run_python_tests=True
-)
-
-
-sparkr = Module(
- name="sparkr",
- dependencies=[sql, mllib],
- source_file_regexes=[
- "R/",
- ],
- should_run_r_tests=True
-)
-
-
-docs = Module(
- name="docs",
- dependencies=[],
- source_file_regexes=[
- "docs/",
- ]
-)
-
-
-ec2 = Module(
- name="ec2",
- dependencies=[],
- source_file_regexes=[
- "ec2/",
- ]
-)
-
-
-# The root module is a dummy module which is used to run all of the tests.
-# No other modules should directly depend on this module.
-root = Module(
- name="root",
- dependencies=[],
- source_file_regexes=[],
- # In order to run all of the tests, enable every test profile:
- build_profile_flags=
- list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
- sbt_test_goals=[
- "test",
- ],
- should_run_python_tests=True,
- should_run_r_tests=True
-)
-
-
def determine_modules_for_files(filenames):
"""
Given a list of filenames, return the set of modules that contain those files.
@@ -315,19 +41,19 @@ def determine_modules_for_files(filenames):
file to belong to the 'root' module.
>>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
- ['pyspark', 'sql']
+ ['pyspark-core', 'sql']
>>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
['root']
"""
changed_modules = set()
for filename in filenames:
matched_at_least_one_module = False
- for module in all_modules:
+ for module in modules.all_modules:
if module.contains_file(filename):
changed_modules.add(module)
matched_at_least_one_module = True
if not matched_at_least_one_module:
- changed_modules.add(root)
+ changed_modules.add(modules.root)
return changed_modules
@@ -352,7 +78,8 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
else:
diff_target = target_ref
- raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target])
+ raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
+ universal_newlines=True)
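+ # universal_newlines=True makes check_output return text rather than bytes under Python 3.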
# Remove any empty strings
return [f for f in raw_output.split('\n') if f]
@@ -362,18 +89,20 @@ def determine_modules_to_test(changed_modules):
Given a set of modules that have changed, compute the transitive closure of those modules'
dependent modules in order to determine the set of modules that should be tested.
- >>> sorted(x.name for x in determine_modules_to_test([root]))
+ >>> sorted(x.name for x in determine_modules_to_test([modules.root]))
['root']
- >>> sorted(x.name for x in determine_modules_to_test([graphx]))
+ >>> sorted(x.name for x in determine_modules_to_test([modules.graphx]))
['examples', 'graphx']
- >>> sorted(x.name for x in determine_modules_to_test([sql]))
- ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql']
+ >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql]))
+ >>> x # doctest: +NORMALIZE_WHITESPACE
+ ['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', 'pyspark-ml', \
+ 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming', 'sparkr', 'sql']
"""
# If we're going to have to run all of the tests, then we can just short-circuit
# and return 'root'. No module depends on root, so if it appears then it will be
# in changed_modules.
- if root in changed_modules:
- return [root]
+ if modules.root in changed_modules:
+ return [modules.root]
modules_to_test = set()
for module in changed_modules:
modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
@@ -398,60 +127,6 @@ def get_error_codes(err_code_file):
ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh"))
-def exit_from_command_with_retcode(cmd, retcode):
- print "[error] running", ' '.join(cmd), "; received return code", retcode
- sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
-
-
-def rm_r(path):
- """Given an arbitrary path properly remove it with the correct python
- construct if it exists
- - from: http://stackoverflow.com/a/9559881"""
-
- if os.path.isdir(path):
- shutil.rmtree(path)
- elif os.path.exists(path):
- os.remove(path)
-
-
-def run_cmd(cmd):
- """Given a command as a list of arguments will attempt to execute the
- command from the determined SPARK_HOME directory and, on failure, print
- an error message"""
-
- if not isinstance(cmd, list):
- cmd = cmd.split()
- try:
- subprocess.check_call(cmd)
- except subprocess.CalledProcessError as e:
- exit_from_command_with_retcode(e.cmd, e.returncode)
-
-
-def is_exe(path):
- """Check if a given path is an executable file
- - from: http://stackoverflow.com/a/377028"""
-
- return os.path.isfile(path) and os.access(path, os.X_OK)
-
-
-def which(program):
- """Find and return the given program by its absolute path or 'None'
- - from: http://stackoverflow.com/a/377028"""
-
- fpath = os.path.split(program)[0]
-
- if fpath:
- if is_exe(program):
- return program
- else:
- for path in os.environ.get("PATH").split(os.pathsep):
- path = path.strip('"')
- exe_file = os.path.join(path, program)
- if is_exe(exe_file):
- return exe_file
- return None
-
-
def determine_java_executable():
"""Will return the path of the java executable that will be used by Spark's
tests or `None`"""
@@ -476,7 +151,8 @@ def determine_java_version(java_exe):
with accessors '.major', '.minor', '.patch', '.update'"""
raw_output = subprocess.check_output([java_exe, "-version"],
- stderr=subprocess.STDOUT)
+ stderr=subprocess.STDOUT,
+ universal_newlines=True)
raw_output_lines = raw_output.split('\n')
@@ -504,10 +180,10 @@ def set_title_and_block(title, err_block):
os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block]
line_str = '=' * 72
- print
- print line_str
- print title
- print line_str
+ print('')
+ print(line_str)
+ print(title)
+ print(line_str)
def run_apache_rat_checks():
@@ -534,8 +210,8 @@ def build_spark_documentation():
jekyll_bin = which("jekyll")
if not jekyll_bin:
- print "[error] Cannot find a version of `jekyll` on the system; please",
- print "install one and retry to build documentation."
+ print("[error] Cannot find a version of `jekyll` on the system; please"
+ " install one and retry to build documentation.")
sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
else:
run_cmd([jekyll_bin, "build"])
@@ -571,7 +247,7 @@ def exec_sbt(sbt_args=()):
echo_proc.wait()
for line in iter(sbt_proc.stdout.readline, ''):
if not sbt_output_filter.match(line):
- print line,
+ print(line, end='')
retcode = sbt_proc.wait()
if retcode > 0:
@@ -594,33 +270,33 @@ def get_hadoop_profiles(hadoop_version):
if hadoop_version in sbt_maven_hadoop_profiles:
return sbt_maven_hadoop_profiles[hadoop_version]
else:
- print "[error] Could not find", hadoop_version, "in the list. Valid options",
- print "are", sbt_maven_hadoop_profiles.keys()
+ print("[error] Could not find", hadoop_version, "in the list. Valid options"
+ " are", sbt_maven_hadoop_profiles.keys())
sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
def build_spark_maven(hadoop_version):
# Enable all of the profiles for the build:
- build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags
+ build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
mvn_goals = ["clean", "package", "-DskipTests"]
profiles_and_goals = build_profiles + mvn_goals
- print "[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:",
- print " ".join(profiles_and_goals)
+ print("[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:",
+ " ".join(profiles_and_goals))
exec_maven(profiles_and_goals)
def build_spark_sbt(hadoop_version):
# Enable all of the profiles for the build:
- build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags
+ build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
sbt_goals = ["package",
"assembly/assembly",
"streaming-kafka-assembly/assembly"]
profiles_and_goals = build_profiles + sbt_goals
- print "[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:",
- print " ".join(profiles_and_goals)
+ print("[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:",
+ " ".join(profiles_and_goals))
exec_sbt(profiles_and_goals)
@@ -648,8 +324,8 @@ def run_scala_tests_maven(test_profiles):
mvn_test_goals = ["test", "--fail-at-end"]
profiles_and_goals = test_profiles + mvn_test_goals
- print "[info] Running Spark tests using Maven with these arguments:",
- print " ".join(profiles_and_goals)
+ print("[info] Running Spark tests using Maven with these arguments:",
+ " ".join(profiles_and_goals))
exec_maven(profiles_and_goals)
@@ -663,8 +339,8 @@ def run_scala_tests_sbt(test_modules, test_profiles):
profiles_and_goals = test_profiles + list(sbt_test_goals)
- print "[info] Running Spark tests using SBT with these arguments:",
- print " ".join(profiles_and_goals)
+ print("[info] Running Spark tests using SBT with these arguments:",
+ " ".join(profiles_and_goals))
exec_sbt(profiles_and_goals)
@@ -684,10 +360,13 @@ def run_scala_tests(build_tool, hadoop_version, test_modules):
run_scala_tests_sbt(test_modules, test_profiles)
-def run_python_tests():
+def run_python_tests(test_modules):
set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
- run_cmd([os.path.join(SPARK_HOME, "python", "run-tests")])
+ command = [os.path.join(SPARK_HOME, "python", "run-tests")]
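+ # Pass --modules only when testing a subset of modules; with just the root module, run the Python test script's default set.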
+ if test_modules != [modules.root]:
+ command.append("--modules=%s" % ','.join(m.name for m in test_modules))
+ run_cmd(command)
def run_sparkr_tests():
@@ -697,14 +376,14 @@ def run_sparkr_tests():
run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")])
run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")])
else:
- print "Ignoring SparkR tests as R was not found in PATH"
+ print("Ignoring SparkR tests as R was not found in PATH")
def main():
# Ensure the user home directory (HOME) is valid and is an absolute directory
if not USER_HOME or not os.path.isabs(USER_HOME):
- print "[error] Cannot determine your home directory as an absolute path;",
- print "ensure the $HOME environment variable is set properly."
+ print("[error] Cannot determine your home directory as an absolute path;"
+ " ensure the $HOME environment variable is set properly.")
sys.exit(1)
os.chdir(SPARK_HOME)
@@ -718,14 +397,14 @@ def main():
java_exe = determine_java_executable()
if not java_exe:
- print "[error] Cannot find a version of `java` on the system; please",
- print "install one and retry."
+ print("[error] Cannot find a version of `java` on the system; please"
+ " install one and retry.")
sys.exit(2)
java_version = determine_java_version(java_exe)
if java_version.minor < 8:
- print "[warn] Java 8 tests will not run because JDK version is < 1.8."
+ print("[warn] Java 8 tests will not run because JDK version is < 1.8.")
if os.environ.get("AMPLAB_JENKINS"):
# if we're on the Amplab Jenkins build servers setup variables
@@ -741,8 +420,8 @@ def main():
hadoop_version = "hadoop2.3"
test_env = "local"
- print "[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
- print "under environment", test_env
+ print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
+ "under environment", test_env)
changed_modules = None
changed_files = None
@@ -751,8 +430,9 @@ def main():
changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch)
changed_modules = determine_modules_for_files(changed_files)
if not changed_modules:
- changed_modules = [root]
- print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules)
+ changed_modules = [modules.root]
+ print("[info] Found the following changed modules:",
+ ", ".join(x.name for x in changed_modules))
test_modules = determine_modules_to_test(changed_modules)
@@ -779,8 +459,9 @@ def main():
# run the test suites
run_scala_tests(build_tool, hadoop_version, test_modules)
- if any(m.should_run_python_tests for m in test_modules):
- run_python_tests()
+ modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
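+ # python_test_goals is defined per module in sparktestsupport.modules and supersedes the old should_run_python_tests flag.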
+ if modules_with_python_tests:
+ run_python_tests(modules_with_python_tests)
if any(m.should_run_r_tests for m in test_modules):
run_sparkr_tests()