dev/create-release/generate-contributors.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283

#!/usr/bin/env python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script automates the process of creating release notes.

import os
import re
import sys

from releaseutils import *

# Release configuration. Each value is read from the environment variable of
# the same name and falls back to the hard-coded default below when unset.
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2")
PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0")

# `raw_input` is a builtin on Python 2 only; on Python 3 the equivalent
# line-reading prompt is `input`. Resolve whichever one is available so the
# prompts below do not die with a NameError.
try:
    _read_line = raw_input  # noqa: F821
except NameError:
    _read_line = input

# If the release tags are not valid, keep prompting the user until
# tag_exists() accepts them
while not tag_exists(RELEASE_TAG):
    RELEASE_TAG = _read_line("Please provide a valid release tag: ")
while not tag_exists(PREVIOUS_RELEASE_TAG):
    print("Please specify the previous release tag.")
    PREVIOUS_RELEASE_TAG = _read_line(
        "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")

# Work out which commits are in the new tag but not in the old one.
# A commit is treated as already released when either its git hash or its
# PR number also appears among the previous tag's commits.
print("Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))
release_commits = get_commits(RELEASE_TAG)
previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG)
previous_release_hashes = set()
previous_release_prs = set()
for released_commit in previous_release_commits:
    previous_release_hashes.add(released_commit.get_hash())
    released_pr_number = released_commit.get_pr_number()
    # Commits without a PR number are tracked by hash only
    if released_pr_number:
        previous_release_prs.add(released_pr_number)


def _is_unreleased(candidate):
    """Return True if neither the hash nor the PR number was in the old tag."""
    candidate_hash = candidate.get_hash()
    candidate_pr_number = candidate.get_pr_number()
    if candidate_hash in previous_release_hashes:
        return False
    return not (candidate_pr_number and candidate_pr_number in previous_release_prs)


new_commits = [candidate for candidate in release_commits if _is_unreleased(candidate)]
if len(new_commits) == 0:
    sys.exit("There are no new commits between %s and %s!" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))

# Summarize the commit range so the user can confirm that it is correct
print("\n==================================================================================")
print("JIRA server: %s" % JIRA_API_BASE)
print("Release tag: %s" % RELEASE_TAG)
print("Previous release tag: %s" % PREVIOUS_RELEASE_TAG)
print("Number of commits in this range: %s" % len(new_commits))
# A bare `print` only emits a blank line as a Python 2 print statement; on
# Python 3 it is an expression that does nothing. Printing an empty string
# produces the blank line on both.
print("")


def print_indented(_list):
    """Print every element of ``_list`` on its own line, indented by two spaces."""
    indented_lines = ("  %s" % entry for entry in _list)
    for indented_line in indented_lines:
        print(indented_line)
# Optionally list every commit in the range, then ask the user to confirm it
_show_all_commits = yesOrNoPrompt("Show all commits?")
if _show_all_commits:
    print_indented(new_commits)
print("==================================================================================\n")
_range_confirmed = yesOrNoPrompt("Does this look correct?")
if not _range_confirmed:
    sys.exit("Ok, exiting")

# Buckets used to separate special commits (release, maintenance, revert,
# no JIRA) from the commits that will be processed further
releases, maintenance, reverts, nojiras = [], [], [], []
filtered_commits = []


def is_release(commit_title):
    """Return True if the commit title marks a release-management commit.

    A title qualifies when it contains "[release]", "preparing spark release"
    or "preparing development version" (each compared case-insensitively), or
    the exact, case-sensitive text "CHANGES.txt".
    """
    lowered = commit_title.lower()
    # A plain substring test replaces the former regex "\[release\]", whose
    # backslashes formed invalid escape sequences in a non-raw string literal.
    # It also means the function always returns a bool rather than a list.
    return "[release]" in lowered or \
        "preparing spark release" in lowered or \
        "preparing development version" in lowered or \
        "CHANGES.txt" in commit_title


def is_maintenance(commit_title):
    """Return True if the title contains "maintenance" or "manually close" (any case)."""
    lowered = commit_title.lower()
    return any(marker in lowered for marker in ("maintenance", "manually close"))


def has_no_jira(commit_title):
    """Return True if the upper-cased title contains no "SPARK-<digits>" issue id."""
    jira_ids = re.findall("SPARK-[0-9]+", commit_title.upper())
    return len(jira_ids) == 0


def is_revert(commit_title):
    """Return True if the title contains "revert" in any letter case."""
    lowered = commit_title.lower()
    return lowered.find("revert") >= 0


def is_docs(commit_title):
    """Return a truthy value if the title looks like a documentation commit.

    The pattern "docs*" matches "doc" followed by any number of "s", so any
    title containing "doc" (in any case) qualifies, as does one containing
    "programming guide". The result is the non-empty list of regex matches
    when there are any, otherwise a bool.
    """
    lowered = commit_title.lower()
    doc_mentions = re.findall("docs*", lowered)
    if doc_mentions:
        return doc_mentions
    return "programming guide" in lowered


# Sort each new commit into a bucket. Rules are tried in the order listed
# and the first match wins; a commit matching none of them is kept for
# further processing. Commits with an empty title are dropped entirely.
_bucket_rules = [
    (is_release, releases),
    (is_maintenance, maintenance),
    (is_revert, reverts),
    (is_docs, filtered_commits),  # docs may not have JIRA numbers
    (has_no_jira, nojiras),
]
for new_commit in new_commits:
    commit_title = new_commit.get_title()
    if not commit_title:
        continue
    for _rule_matches, _bucket in _bucket_rules:
        if _rule_matches(commit_title):
            _bucket.append(new_commit)
            break
    else:
        filtered_commits.append(new_commit)

# Tell the user which commits are being ignored. Each entry pairs a bucket
# with its one-line summary format and its heading in the detailed listing.
_ignored_groups = [
    (releases, "Found %d release commits", "Release (%d)"),
    (maintenance, "Found %d maintenance commits", "Maintenance (%d)"),
    (reverts, "Found %d revert commits", "Revert (%d)"),
    (nojiras, "Found %d commits with no JIRA", "No JIRA (%d)"),
]
if any(_group for _group, _summary, _heading in _ignored_groups):
    print("\n==================================================================================")
    for _group, _summary, _heading in _ignored_groups:
        if _group:
            print(_summary % len(_group))
    print("* Warning: these commits will be ignored.\n")
    if yesOrNoPrompt("Show ignored commits?"):
        for _group, _summary, _heading in _ignored_groups:
            if _group:
                print(_heading % len(_group))
                print_indented(_group)
    print("==================== Warning: the above commits will be ignored ==================\n")
prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits)
_ok_to_proceed = yesOrNoPrompt(prompt_msg)
if not _ok_to_proceed:
    sys.exit("Ok, exiting.")

# Keep track of warnings to tell the user at the end
warnings = []

# Mapping from the invalid author name to its associated JIRA issues
# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
invalid_authors = {}

# Populate a map that groups issues and components by author
# It takes the form: Author name -> { Contribution type -> set of Spark components }
# For instance,
# {
#   'Andrew Or': {
#     'bug fixes': set(['windows', 'core', 'web ui']),
#     'improvements': set(['core'])
#   },
#   'Tathagata Das' : {
#     'bug fixes': set(['streaming']),
#     'new feature': set(['streaming'])
#   }
# }
#
author_info = {}
jira_options = {"server": JIRA_API_BASE}
jira_client = JIRA(options=jira_options)


def _record_contribution(author, issue_type, components):
    """Merge ``components`` into ``author_info[author][issue_type]``.

    The author is passed in explicitly so this helper is defined once rather
    than re-created as a closure over the loop variable on every iteration.
    An empty ``components`` is recorded as ``[CORE_COMPONENT]``.
    """
    components = components or [CORE_COMPONENT]  # assume core if no components provided
    author_info.setdefault(author, {}).setdefault(issue_type, set()).update(components)


print("\n=========================== Compiling contributor list ===========================")
for commit in filtered_commits:
    _hash = commit.get_hash()
    title = commit.get_title()
    issues = re.findall("SPARK-[0-9]+", title.upper())
    author = commit.get_author()
    date = get_date(_hash)
    # If the author name is invalid, keep track of it along
    # with all associated issues so we can translate it later
    if is_valid_author(author):
        author = capitalize_author(author)
    else:
        invalid_authors.setdefault(author, set()).update(issues)
    # Parse components from the commit title, if any
    commit_components = find_components(title, _hash)
    # Find issues and components associated with this commit
    for issue in issues:
        try:
            jira_issue = jira_client.issue(issue)
            jira_type = jira_issue.fields.issuetype.name
            jira_type = translate_issue_type(jira_type, issue, warnings)
            jira_components = [translate_component(component.name, _hash, warnings)
                               for component in jira_issue.fields.components]
            all_components = set(jira_components + commit_components)
            _record_contribution(author, jira_type, all_components)
        except Exception as e:
            # Best effort: keep going with the remaining issues. Format the
            # message explicitly; print("...", e) prints a tuple when run as
            # a Python 2 print statement.
            print("Unexpected error: %s" % (e,))
            # The contribution for this issue is not recorded, so surface the
            # failure in the final warning summary as well.
            warnings.append("Failed to process %s (commit %s): %s" % (issue, _hash, e))
    # For docs without an associated JIRA, manually add it ourselves
    if is_docs(title) and not issues:
        _record_contribution(author, "documentation", commit_components)
    print("  Processed commit %s authored by %s on %s" % (_hash, author, date))
print("==================================================================================\n")

# Write to contributors file ordered by author names, one author per line.
# A contribution summary is still built and sanity-checked for every author,
# e.g. "Bug fixes in Windows, Core, and Web UI; improvements in Core", but it
# is not part of the written line: the " * Author name -- contributions"
# format is disabled and only the author name is emitted.
# sorted() works on both a Python 2 key list and a Python 3 dict view, whereas
# keys().sort() only works on Python 2.
authors = sorted(author_info)
# The context manager closes the file even if the assert or a write fails
with open(contributors_file_name, "w") as contributors_file:
    for author in authors:
        contribution = ""
        components = set()
        issue_types = set()
        for issue_type, comps in author_info[author].items():
            components.update(comps)
            issue_types.add(issue_type)
        # If there is only one component, mention it only once
        # e.g. Bug fixes, improvements in MLlib
        if len(components) == 1:
            contribution = "%s in %s" % (nice_join(issue_types), next(iter(components)))
        # Otherwise, group contributions by issue types instead of modules
        # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN
        else:
            contributions = ["%s in %s" % (issue_type, nice_join(comps))
                             for issue_type, comps in author_info[author].items()]
            contribution = "; ".join(contributions)
        # Do not use python's capitalize() on the whole string to preserve case
        assert contribution
        contribution = contribution[0].capitalize() + contribution[1:]
        # If the author name is invalid, use an intermediate format that
        # can be translated through translate-contributors.py later
        # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
        # The issues are sorted so that the output is the same on every run.
        if invalid_authors.get(author):
            author = author + "/" + "/".join(sorted(invalid_authors[author]))
        line = author
        contributors_file.write(line + "\n")
print("Contributors list is successfully written to %s!" % contributors_file_name)

# Ask the user to translate any author names that failed validation
if invalid_authors:
    warnings.append("Found the following invalid authors:")
    warnings.extend(["\t%s" % invalid_author for invalid_author in invalid_authors])
    warnings.append("Please run './translate-contributors.py' to translate them.")

# Report every warning collected along the way
if len(warnings) > 0:
    print("\n============ Warnings encountered while creating the contributor list ============")
    for warning_line in warnings:
        print(warning_line)
    print("Please correct these in the final contributors list at %s." % contributors_file_name)
    print("==================================================================================\n")