[Release] Major improvements to generate contributors script

This commit introduces several major improvements to the script that generates the contributors list for release notes, notably: (1) Use release tags instead of a range of commits. Across branches, the commit history is not strictly linear, so it is not sufficient to specify a start hash and an end hash; otherwise, we end up counting commits that were already merged in an older branch. (2) Match PR numbers in addition to commit hashes. This is related to the first point in that if a PR is already merged in an older minor release tag, it should be filtered out here. This requires us to do some intelligent regex parsing on the commit description in addition to just relying on the GitHub API. (3) Relax author validity check. The old code fails on a name that has many middle names, for instance. The test was just too strict. (4) Use GitHub authentication. This allows us to make far more requests through the GitHub API than before (5000 as opposed to 60 per hour). (5) Translate from GitHub username, not commit author name. This is important because the commit author name is not always configured correctly by the user. For instance, the username "falaki" used to resolve to just "Hossein", which was treated as a GitHub username and translated to something else that is completely arbitrary. (6) Add an option to use the untranslated name. If there is not a satisfactory candidate to replace the untranslated name with, at least allow the user to not translate it.
author: Andrew Or <andrew@databricks.com> 2014-12-16 17:55:27 -0800
committer: Andrew Or <andrew@databricks.com> 2014-12-16 18:05:46 -0800
commit: 0fb00473904ff3643b6f6848e0faa0deeb1d60f5 (patch)
tree: d98a1db6b868cfdc869831fc44771a3008960a78 /dev
parent: 1b6fc237c26d9fcb9d4afc9c93a21f9134231145 (diff)
download: spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.tar.gz
spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.tar.bz2
spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.zip
3 files changed, 206 insertions, 89 deletions
diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py
index a3b78a3eac..e8f81ccbce 100755
--- a/dev/create-release/generate-contributors.py
+++ b/dev/create-release/generate-contributors.py
@@ -26,75 +26,103 @@ from releaseutils import *
 
 # You must set the following before use!
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
-START_COMMIT = os.environ.get("START_COMMIT", "37b100")
-END_COMMIT = os.environ.get("END_COMMIT", "3693ae")
-
-# If commit range is not specified, prompt the user to provide it
-if not START_COMMIT or not END_COMMIT:
-    print "A commit range is required to proceed."
-    if not START_COMMIT:
-        START_COMMIT = raw_input("Please specify starting commit hash (inclusive): ")
-    if not END_COMMIT:
-        END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ")
-
-# Verify provided arguments
-start_commit_line = get_one_line(START_COMMIT)
-end_commit_line = get_one_line(END_COMMIT)
-num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
-if not start_commit_line: sys.exit("Start commit %s not found!" % START_COMMIT)
-if not end_commit_line: sys.exit("End commit %s not found!" % END_COMMIT)
-if num_commits == 0:
-    sys.exit("There are no commits in the provided range [%s, %s)" % (START_COMMIT, END_COMMIT))
+RELEASE_TAG = os.environ.get("START_COMMIT", "v1.2.0-rc2")
+PREVIOUS_RELEASE_TAG = os.environ.get("END_COMMIT", "v1.1.0")
+
+# If the release tags are not provided, prompt the user to provide them
+while not tag_exists(RELEASE_TAG):
+    RELEASE_TAG = raw_input("Please provide a valid release tag: ")
+while not tag_exists(PREVIOUS_RELEASE_TAG):
+    print "Please specify the previous release tag."
+    PREVIOUS_RELEASE_TAG = raw_input(\
+        "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")
+
+# Gather commits found in the new tag but not in the old tag.
+# This filters commits based on both the git hash and the PR number.
+# If either is present in the old tag, then we ignore the commit.
+print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG)
+release_commits = get_commits(RELEASE_TAG)
+previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG)
+previous_release_hashes = set()
+previous_release_prs = set()
+for old_commit in previous_release_commits:
+    previous_release_hashes.add(old_commit.get_hash())
+    if old_commit.get_pr_number():
+        previous_release_prs.add(old_commit.get_pr_number())
+new_commits = []
+for this_commit in release_commits:
+    this_hash = this_commit.get_hash()
+    this_pr_number = this_commit.get_pr_number()
+    if this_hash in previous_release_hashes:
+        continue
+    if this_pr_number and this_pr_number in previous_release_prs:
+        continue
+    new_commits.append(this_commit)
+if not new_commits:
+    sys.exit("There are no new commits between %s and %s!" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))
+
+# Prompt the user for confirmation that the commit range is correct
 print "\n=================================================================================="
 print "JIRA server: %s" % JIRA_API_BASE
-print "Start commit (inclusive): %s" % start_commit_line
-print "End commit (non-inclusive): %s" % end_commit_line
-print "Number of commits in this range: %s" % num_commits
+print "Release tag: %s" % RELEASE_TAG
+print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG
+print "Number of commits in this range: %s" % len(new_commits)
 print
-response = raw_input("Is this correct? [Y/n] ")
-if response.lower() != "y" and response:
-    sys.exit("Ok, exiting")
+def print_indented(_list):
+    for x in _list: print "  %s" % x
+if yesOrNoPrompt("Show all commits?"):
+    print_indented(new_commits)
 print "==================================================================================\n"
-
-# Find all commits within this range
-print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
-commits = get_one_line_commits(START_COMMIT, END_COMMIT)
-if not commits: sys.exit("Error: No commits found within this range!")
-commits = commits.split("\n")
+if not yesOrNoPrompt("Does this look correct?"):
+    sys.exit("Ok, exiting")
 
 # Filter out special commits
 releases = []
+maintenance = []
 reverts = []
 nojiras = []
 filtered_commits = []
-def is_release(commit):
-    return re.findall("\[release\]", commit.lower()) or\
-        "maven-release-plugin" in commit or "CHANGES.txt" in commit
-def has_no_jira(commit):
-    return not re.findall("SPARK-[0-9]+", commit.upper())
-def is_revert(commit):
-    return "revert" in commit.lower()
-def is_docs(commit):
-    return re.findall("docs*", commit.lower()) or "programming guide" in commit.lower()
-for c in commits:
-    if not c: continue
-    elif is_release(c): releases.append(c)
-    elif is_revert(c): reverts.append(c)
-    elif is_docs(c): filtered_commits.append(c) # docs may not have JIRA numbers
-    elif has_no_jira(c): nojiras.append(c)
+def is_release(commit_title):
+    return re.findall("\[release\]", commit_title.lower()) or\
+        "preparing spark release" in commit_title.lower() or\
+        "preparing development version" in commit_title.lower() or\
+        "CHANGES.txt" in commit_title
+def is_maintenance(commit_title):
+    return "maintenance" in commit_title.lower() or\
+      "manually close" in commit_title.lower()
+def has_no_jira(commit_title):
+    return not re.findall("SPARK-[0-9]+", commit_title.upper())
+def is_revert(commit_title):
+    return "revert" in commit_title.lower()
+def is_docs(commit_title):
+    return re.findall("docs*", commit_title.lower()) or\
+        "programming guide" in commit_title.lower()
+for c in new_commits:
+    t = c.get_title()
+    if not t: continue
+    elif is_release(t): releases.append(c)
+    elif is_maintenance(t): maintenance.append(c)
+    elif is_revert(t): reverts.append(c)
+    elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers
+    elif has_no_jira(t): nojiras.append(c)
     else: filtered_commits.append(c)
 
 # Warn against ignored commits
-def print_indented(_list):
-    for x in _list: print "  %s" % x
-if releases or reverts or nojiras:
+if releases or maintenance or reverts or nojiras:
     print "\n=================================================================================="
-    if releases: print "Releases (%d)" % len(releases); print_indented(releases)
-    if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts)
-    if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
+    if releases: print "Found %d release commits" % len(releases)
+    if maintenance: print "Found %d maintenance commits" % len(maintenance)
+    if reverts: print "Found %d revert commits" % len(reverts)
+    if nojiras: print "Found %d commits with no JIRA" % len(nojiras)
+    print "* Warning: these commits will be ignored.\n"
+    if yesOrNoPrompt("Show ignored commits?"):
+        if releases: print "Release (%d)" % len(releases); print_indented(releases)
+        if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance)
+        if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts)
+        if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
     print "==================== Warning: the above commits will be ignored ==================\n"
-response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits))
-if response.lower() != "y" and response:
+prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits)
+if not yesOrNoPrompt(prompt_msg):
     sys.exit("Ok, exiting.")
 
 # Keep track of warnings to tell the user at the end
@@ -123,10 +151,11 @@ jira_options = { "server": JIRA_API_BASE }
 jira_client = JIRA(options = jira_options)
 print "\n=========================== Compiling contributor list ==========================="
 for commit in filtered_commits:
-    commit_hash = re.findall("^[a-z0-9]+", commit)[0]
-    issues = re.findall("SPARK-[0-9]+", commit.upper())
-    author = get_author(commit_hash)
-    author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters
+    _hash = commit.get_hash()
+    title = commit.get_title()
+    issues = re.findall("SPARK-[0-9]+", title.upper())
+    author = commit.get_author()
+    date = get_date(_hash)
     # If the author name is invalid, keep track of it along
     # with all associated issues so we can translate it later
     if is_valid_author(author):
@@ -136,9 +165,8 @@ for commit in filtered_commits:
             invalid_authors[author] = set()
         for issue in issues:
             invalid_authors[author].add(issue)
-    date = get_date(commit_hash)
-    # Parse components from the commit message, if any
-    commit_components = find_components(commit, commit_hash)
+    # Parse components from the commit title, if any
+    commit_components = find_components(title, _hash)
     # Populate or merge an issue into author_info[author]
     def populate(issue_type, components):
         components = components or [CORE_COMPONENT] # assume core if no components provided
@@ -153,14 +181,14 @@ for commit in filtered_commits:
         jira_issue = jira_client.issue(issue)
         jira_type = jira_issue.fields.issuetype.name
         jira_type = translate_issue_type(jira_type, issue, warnings)
-        jira_components = [translate_component(c.name, commit_hash, warnings)\
+        jira_components = [translate_component(c.name, _hash, warnings)\
             for c in jira_issue.fields.components]
         all_components = set(jira_components + commit_components)
         populate(jira_type, all_components)
     # For docs without an associated JIRA, manually add it ourselves
-    if is_docs(commit) and not issues:
+    if is_docs(title) and not issues:
         populate("documentation", commit_components)
-    print "  Processed commit %s authored by %s on %s" % (commit_hash, author, date)
+    print "  Processed commit %s authored by %s on %s" % (_hash, author, date)
 print "==================================================================================\n"
 
 # Write to contributors file ordered by author names
diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
index 76a10c3288..18e16bcb90 100755
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@@ -19,6 +19,7 @@
 # This file contains helper methods used in creating a release.
 
 import re
+import sys
 from subprocess import Popen, PIPE
 
 try:
@@ -47,20 +48,85 @@ except ImportError:
 # Contributors list file name
 contributors_file_name = "contributors.txt"
 
+# Prompt the user to answer yes or no until they do so
+def yesOrNoPrompt(msg):
+    response = raw_input("%s [y/n]: " % msg)
+    while response != "y" and response != "n":
+        return yesOrNoPrompt(msg)
+    return response == "y"
+
 # Utility functions run git commands (written with Git 1.8.5)
 def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0]
-def get_author(commit_hash):
-    return run_cmd(["git", "show", "--quiet", "--pretty=format:%an", commit_hash])
+def run_cmd_error(cmd): return Popen(cmd, stderr=PIPE).communicate()[1]
 def get_date(commit_hash):
     return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash])
-def get_one_line(commit_hash):
-    return run_cmd(["git", "show", "--quiet", "--pretty=format:\"%h %cd %s\"", commit_hash])
-def get_one_line_commits(start_hash, end_hash):
-    return run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)])
-def num_commits_in_range(start_hash, end_hash):
-    output = run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)])
-    lines = [line for line in output.split("\n") if line] # filter out empty lines
-    return len(lines)
+def tag_exists(tag):
+    stderr = run_cmd_error(["git", "checkout", tag])
+    return "error" not in stderr
+
+# A type-safe representation of a commit
+class Commit:
+    def __init__(self, _hash, author, title, pr_number = None):
+        self._hash = _hash
+        self.author = author
+        self.title = title
+        self.pr_number = pr_number
+    def get_hash(self): return self._hash
+    def get_author(self): return self.author
+    def get_title(self): return self.title
+    def get_pr_number(self): return self.pr_number
+    def __str__(self):
+        closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else ""
+        return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr)
+
+# Return all commits that belong to the specified tag.
+#
+# Under the hood, this runs a `git log` on that tag and parses the fields
+# from the command output to construct a list of Commit objects. Note that
+# because certain fields reside in the commit description and cannot be parsed
+# through the Github API itself, we need to do some intelligent regex parsing
+# to extract those fields.
+#
+# This is written using Git 1.8.5.
+def get_commits(tag):
+    commit_start_marker = "|=== COMMIT START MARKER ===|"
+    commit_end_marker = "|=== COMMIT END MARKER ===|"
+    field_end_marker = "|=== COMMIT FIELD END MARKER ===|"
+    log_format =\
+        commit_start_marker + "%h" +\
+        field_end_marker + "%an" +\
+        field_end_marker + "%s" +\
+        commit_end_marker + "%b"
+    output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag])
+    commits = []
+    raw_commits = [c for c in output.split(commit_start_marker) if c]
+    for commit in raw_commits:
+        if commit.count(commit_end_marker) != 1:
+            print "Commit end marker not found in commit: "
+            for line in commit.split("\n"): print line
+            sys.exit(1)
+        # Separate commit digest from the body
+        # From the digest we extract the hash, author and the title
+        # From the body, we extract the PR number and the github username
+        [commit_digest, commit_body] = commit.split(commit_end_marker)
+        if commit_digest.count(field_end_marker) != 2:
+            sys.exit("Unexpected format in commit: %s" % commit_digest)
+        [_hash, author, title] = commit_digest.split(field_end_marker)
+        # The PR number and github username is in the commit message
+        # itself and cannot be accessed through any Github API
+        pr_number = None
+        match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body)
+        if match:
+            [pr_number, github_username] = match.groups()
+            # If the author name is not valid, use the github
+            # username so we can translate it properly later
+            if not is_valid_author(author):
+                author = github_username
+        # Guard against special characters
+        author = unidecode.unidecode(unicode(author, "UTF-8")).strip()
+        commit = Commit(_hash, author, title, pr_number)
+        commits.append(commit)
+    return commits
 
 # Maintain a mapping for translating issue types to contributions in the release notes
 # This serves an additional function of warning the user against unknown issue types
@@ -70,10 +136,13 @@ def num_commits_in_range(start_hash, end_hash):
 known_issue_types = {
     "bug": "bug fixes",
     "build": "build fixes",
+    "dependency upgrade": "build fixes",
     "improvement": "improvements",
     "new feature": "new features",
     "documentation": "documentation",
-    "test": "test"
+    "test": "test",
+    "task": "improvement",
+    "sub-task": "improvement"
 }
 
 # Maintain a mapping for translating component names when creating the release notes
@@ -176,8 +245,7 @@ def get_jira_name(author, jira_client):
 # Return whether the given name is in the form <First Name><space><Last Name>
 def is_valid_author(author):
     if not author: return False
-    author_words = len(author.split(" "))
-    return author_words == 2 or author_words == 3
+    return " " in author and not re.findall("[0-9]", author)
 
 # Capitalize the first letter of each word in the given author name
 def capitalize_author(author):
diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py
index ef4625b003..462c21142f 100755
--- a/dev/create-release/translate-contributors.py
+++ b/dev/create-release/translate-contributors.py
@@ -37,8 +37,11 @@ from releaseutils import *
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
 JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
 JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
+GITHUB_API_TOKEN = os.environ.get("GITHUB_API_TOKEN", None)
 if not JIRA_USERNAME or not JIRA_PASSWORD:
     sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set")
+if not GITHUB_API_TOKEN:
+    sys.exit("GITHUB_API_TOKEN must be set")
 
 # Write new contributors list to <old_file_name>.new
 if not os.path.isfile(contributors_file_name):
@@ -62,7 +65,7 @@ if INTERACTIVE_MODE:
 # Setup Github and JIRA clients
 jira_options = { "server": JIRA_API_BASE }
 jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
-github_client = Github()
+github_client = Github(GITHUB_API_TOKEN)
 
 # Generate candidates for the given author. This should only be called if the given author
 # name does not represent a full name as this operation is somewhat expensive. Under the
@@ -94,7 +97,14 @@ def generate_candidates(author, issues):
     # Then do the same for the assignee of each of the associated JIRAs
     # Note that a given issue may not have an assignee, or the assignee may not have a full name
     for issue in issues:
-        jira_issue = jira_client.issue(issue)
+        try:
+            jira_issue = jira_client.issue(issue)
+        except JIRAError as e:
+            # Do not exit just because an issue is not found!
+            if e.status_code == 404:
+                warnings.append("Issue %s not found!" % issue)
+                continue
+            raise e
         jira_assignee = jira_issue.fields.assignee
         if jira_assignee:
             user_name = jira_assignee.name
@@ -123,9 +133,10 @@ def generate_candidates(author, issues):
 # In non-interactive mode, this script picks the first valid author name from the candidates
 # If no such name exists, the original name is used (without the JIRA numbers).
 print "\n========================== Translating contributor list =========================="
-for line in contributors_file:
+lines = contributors_file.readlines()
+for i, line in enumerate(lines):
     author = line.split(" - ")[0]
-    print "Processing author %s" % author
+    print "Processing author %s (%d/%d)" % (author, i + 1, len(lines))
     if not author:
         print "    ERROR: Expected the following format <author> - <contributions>"
         print "    ERROR: Actual = %s" % line
@@ -135,30 +146,39 @@ for line in contributors_file:
         candidates = generate_candidates(new_author, issues)
         # Print out potential replacement candidates along with the sources, e.g.
         #   [X] No full name found for Github user andrewor14
+        #   [X] No assignee found for SPARK-1763
         #   [0] Andrew Or - Full name of JIRA user andrewor14
         #   [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14
         #   [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14
-        #   [X] No assignee found for SPARK-1763
-        #   [3] Custom
+        #   [3] andrewor14 - Raw Github username
+        #   [4] Custom
         candidate_names = []
+        bad_prompts = [] # Prompts that can't actually be selected; print these first.
+        good_prompts = [] # Prompts that contain valid choices
         for candidate, source in candidates:
             if candidate == NOT_FOUND:
-                print "    [X] %s" % source
+                bad_prompts.append("    [X] %s" % source)
             else:
                 index = len(candidate_names)
                 candidate_names.append(candidate)
-                print "    [%d] %s - %s" % (index, candidate, source)
-        custom_index = len(candidate_names)
+                good_prompts.append("    [%d] %s - %s" % (index, candidate, source))
+        raw_index = len(candidate_names)
+        custom_index = len(candidate_names) + 1
+        for p in bad_prompts: print p
+        if bad_prompts: print "    ---"
+        for p in good_prompts: print p
         # In interactive mode, additionally provide "custom" option and await user response
         if INTERACTIVE_MODE:
+            print "    [%d] %s - Raw Github username" % (raw_index, new_author)
             print "    [%d] Custom" % custom_index
             response = raw_input("    Your choice: ")
-            while not response.isdigit() or int(response) > custom_index:
-                response = raw_input("    Please enter an integer between 0 and %d: " % custom_index)
+            last_index = custom_index
+            while not response.isdigit() or int(response) > last_index:
+                response = raw_input("    Please enter an integer between 0 and %d: " % last_index)
             response = int(response)
             if response == custom_index:
                 new_author = raw_input("    Please type a custom name for this author: ")
-            else:
+            elif response != raw_index:
                 new_author = candidate_names[response]
         # In non-interactive mode, just pick the first candidate
         else:
@@ -175,6 +195,7 @@ for line in contributors_file:
         print "    * Replacing %s with %s" % (author, new_author)
         line = line.replace(author, new_author)
     new_contributors_file.write(line)
+    new_contributors_file.flush()
 print "==================================================================================\n"
 contributors_file.close()
 new_contributors_file.close()
author	Andrew Or <andrew@databricks.com>	2014-12-16 17:55:27 -0800
committer	Andrew Or <andrew@databricks.com>	2014-12-16 18:05:46 -0800
commit	0fb00473904ff3643b6f6848e0faa0deeb1d60f5 (patch)
tree	d98a1db6b868cfdc869831fc44771a3008960a78 /dev
parent	1b6fc237c26d9fcb9d4afc9c93a21f9134231145 (diff)
download	spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.tar.gz spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.tar.bz2 spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.zip