#!/usr/bin/python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Audits binary and maven artifacts for a Spark release.
# Requires GPG, Maven, sbt, and wget on the PATH.
# Usage:
#   python audit_release.py

import os
import re
import shutil
import subprocess
import sys
import time
import urllib2

# Note: The following variables must be set before use!
RELEASE_URL = "http://people.apache.org/~andrewor14/spark-1.1.1-rc1/"
RELEASE_KEY = "XXXXXXXX"  # The 8-character hex ID of your GPG signing key
RELEASE_REPOSITORY = "https://repository.apache.org/content/repositories/orgapachespark-1033"
RELEASE_VERSION = "1.1.1"
SCALA_VERSION = "2.11.7"
SCALA_BINARY_VERSION = "2.11"

# Do not set these
LOG_FILE_NAME = "spark_audit_%s" % time.strftime("%h_%d_%Y_%I_%M_%S")
LOG_FILE = open(LOG_FILE_NAME, 'w')
WORK_DIR = "/tmp/audit_%s" % int(time.time())
MAVEN_CMD = "mvn"
GPG_CMD = "gpg"
SBT_CMD = "sbt -Dsbt.log.noformat=true"

# Track failures to print them at the end
failures = []

# Log a message. Use sparingly because this flushes every write.
def log(msg):
    LOG_FILE.write(msg + "\n")
    LOG_FILE.flush()

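# Log a message and also print it to stdout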
def log_and_print(msg):
    print msg
    log(msg)

# Prompt the user to delete the scratch directory used
def clean_work_files():
    response = raw_input("OK to delete scratch directory '%s'? (y/N) " % WORK_DIR)
    if response == "y":
        shutil.rmtree(WORK_DIR)

# Run the given command and log its output to the log file
def run_cmd(cmd, exit_on_failure=True):
    log("Running command: %s" % cmd)
    ret = subprocess.call(cmd, shell=True, stdout=LOG_FILE, stderr=LOG_FILE)
    if ret != 0 and exit_on_failure:
        log_and_print("Command failed: %s" % cmd)
        clean_work_files()
        sys.exit(-1)
    return ret

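# Run the given command and return its output; stderr is sent to the log file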
def run_cmd_with_output(cmd):
    log_and_print("Running command: %s" % cmd)
    return subprocess.check_output(cmd, shell=True, stderr=LOG_FILE)

# Test whether the given condition holds
# If so, log the message as passed; otherwise log it as failed
def test(cond, msg):
    return passed(msg) if cond else failed(msg)

def passed(msg):
    log_and_print("[PASSED] %s" % msg)

def failed(msg):
    failures.append(msg)
    log_and_print("[**FAILED**] %s" % msg)

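# Fetch the contents of the given URL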
def get_url(url):
    return urllib2.urlopen(url).read()

# If the path exists, prompt the user to delete it
# If the resource is not deleted, abort
def ensure_path_not_present(path):
    full_path = os.path.expanduser(path)
    if os.path.exists(full_path):
        print "Found %s locally." % full_path
        response = raw_input("This can interfere with testing published artifacts. OK to delete? (y/N) ")
        if response == "y":
            shutil.rmtree(full_path)
        else:
            print "Abort."
            sys.exit(-1)

log_and_print("|-------- Starting Spark audit tests for release %s --------|" % RELEASE_VERSION)
log_and_print("Log output can be found in %s" % LOG_FILE_NAME)

original_dir = os.getcwd()

# For each of these modules, we'll test an 'empty' application in sbt and
# maven that links against them. This catches issues such as missing or
# broken dependencies within those projects.
modules = [
    "spark-core", "spark-mllib", "spark-streaming", "spark-repl",
    "spark-graphx", "spark-streaming-kafka",
    "spark-catalyst", "spark-sql", "spark-hive", "spark-streaming-kinesis-asl"
]
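# Published artifact names carry the Scala binary version suffix, e.g. spark-core_2.11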
modules = map(lambda m: "%s_%s" % (m, SCALA_BINARY_VERSION), modules)

# Check for directories that might interfere with tests
local_ivy_spark = "~/.ivy2/local/org.apache.spark"
cache_ivy_spark = "~/.ivy2/cache/org.apache.spark"
local_maven_kafka = "~/.m2/repository/org/apache/kafka"
local_maven_spark = "~/.m2/repository/org/apache/spark"
map(ensure_path_not_present, [local_ivy_spark, cache_ivy_spark, local_maven_kafka, local_maven_spark])

# SBT build tests
log_and_print("==== Building SBT modules ====")
os.chdir("blank_sbt_build")
os.environ["SPARK_VERSION"] = RELEASE_VERSION
os.environ["SCALA_VERSION"] = SCALA_VERSION
os.environ["SPARK_RELEASE_REPOSITORY"] = RELEASE_REPOSITORY
os.environ["SPARK_AUDIT_MASTER"] = "local"
for module in modules:
    log("==== Building module %s in SBT ====" % module)
    os.environ["SPARK_MODULE"] = module
    ret = run_cmd("%s clean update" % SBT_CMD, exit_on_failure=False)
    test(ret == 0, "SBT build against '%s' module" % module)
os.chdir(original_dir)

# SBT application tests
log_and_print("==== Building SBT applications ====")
for app in ["sbt_app_core", "sbt_app_graphx", "sbt_app_streaming", "sbt_app_sql", "sbt_app_hive", "sbt_app_kinesis"]:
    log("==== Building application %s in SBT ====" % app)
    os.chdir(app)
    ret = run_cmd("%s clean run" % SBT_CMD, exit_on_failure=False)
    test(ret == 0, "SBT application (%s)" % app)
    os.chdir(original_dir)

# Maven build tests
os.chdir("blank_maven_build")
log_and_print("==== Building Maven modules ====")
for module in modules:
    log("==== Building module %s in maven ====" % module)
    cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" '
           '-Dspark.module="%s" clean compile' %
           (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, module))
    ret = run_cmd(cmd, exit_on_failure=False)
    test(ret == 0, "maven build against '%s' module" % module)
os.chdir(original_dir)

# Maven application tests
log_and_print("==== Building Maven applications ====")
os.chdir("maven_app_core")
mvn_exec_cmd = ('%s --update-snapshots -Dspark.release.repository="%s" -Dspark.version="%s" '
                '-Dscala.binary.version="%s" clean compile '
                'exec:java -Dexec.mainClass="SimpleApp"' %
                (MAVEN_CMD, RELEASE_REPOSITORY, RELEASE_VERSION, SCALA_BINARY_VERSION))
ret = run_cmd(mvn_exec_cmd, exit_on_failure=False)
test(ret == 0, "maven application (core)")
os.chdir(original_dir)

# Binary artifact tests
if os.path.exists(WORK_DIR):
    print "Working directory '%s' already exists" % WORK_DIR
    sys.exit(-1)
os.mkdir(WORK_DIR)
os.chdir(WORK_DIR)

index_page = get_url(RELEASE_URL)
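# Scrape the staged release page for links to .tgz artifacts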
artifact_regex = re.compile(r"<a href=\"(.*\.tgz)\">")
artifacts = artifact_regex.findall(index_page)

# Verify artifact integrity
for artifact in artifacts:
    log_and_print("==== Verifying download integrity for artifact: %s ====" % artifact)

    artifact_url = "%s/%s" % (RELEASE_URL, artifact)
    key_file = "%s.asc" % artifact
    run_cmd("wget %s" % artifact_url)
    run_cmd("wget %s/%s" % (RELEASE_URL, key_file))
    run_cmd("wget %s%s" % (artifact_url, ".sha"))

    # Verify signature
    run_cmd("%s --keyserver pgp.mit.edu --recv-key %s" % (GPG_CMD, RELEASE_KEY))
    run_cmd("%s %s" % (GPG_CMD, key_file))
    passed("Artifact signature verified.")

    # Verify md5
    my_md5 = run_cmd_with_output("%s --print-md MD5 %s" % (GPG_CMD, artifact)).strip()
    release_md5 = get_url("%s.md5" % artifact_url).strip()
    test(my_md5 == release_md5, "Artifact MD5 verified.")

    # Verify sha
    my_sha = run_cmd_with_output("%s --print-md SHA512 %s" % (GPG_CMD, artifact)).strip()
    release_sha = get_url("%s.sha" % artifact_url).strip()
    test(my_sha == release_sha, "Artifact SHA verified.")

    # Verify that the files required in Apache releases are present
    dir_name = artifact.replace(".tgz", "")
    run_cmd("tar xvzf %s" % artifact)
    base_files = os.listdir(dir_name)
    test("CHANGES.txt" in base_files, "Tarball contains CHANGES.txt file")
    test("NOTICE" in base_files, "Tarball contains NOTICE file")
    test("LICENSE" in base_files, "Tarball contains LICENSE file")

    os.chdir(WORK_DIR)

# Report result
log_and_print("\n")
if len(failures) == 0:
    log_and_print("*** ALL TESTS PASSED ***")
else:
    log_and_print("XXXXX SOME TESTS DID NOT PASS XXXXX")
    for f in failures:
        log_and_print("  %s" % f)
os.chdir(original_dir)

# Clean up
clean_work_files()

log_and_print("|-------- Spark release audit complete --------|")