3 files changed, 206 insertions, 89 deletions
diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py
index a3b78a3eac..e8f81ccbce 100755
--- a/dev/create-release/generate-contributors.py
+++ b/dev/create-release/generate-contributors.py
@@ -26,75 +26,103 @@ from releaseutils import *
 
 # You must set the following before use!
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
-START_COMMIT = os.environ.get("START_COMMIT", "37b100")
-END_COMMIT = os.environ.get("END_COMMIT", "3693ae")
-
-# If commit range is not specified, prompt the user to provide it
-if not START_COMMIT or not END_COMMIT:
-    print "A commit range is required to proceed."
-    if not START_COMMIT:
-        START_COMMIT = raw_input("Please specify starting commit hash (inclusive): ")
-    if not END_COMMIT:
-        END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ")
-
-# Verify provided arguments
-start_commit_line = get_one_line(START_COMMIT)
-end_commit_line = get_one_line(END_COMMIT)
-num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
-if not start_commit_line: sys.exit("Start commit %s not found!" % START_COMMIT)
-if not end_commit_line: sys.exit("End commit %s not found!" % END_COMMIT)
-if num_commits == 0:
-    sys.exit("There are no commits in the provided range [%s, %s)" % (START_COMMIT, END_COMMIT))
+# Prefer RELEASE_TAG / PREVIOUS_RELEASE_TAG; START_COMMIT / END_COMMIT are legacy fallbacks
+RELEASE_TAG = os.environ.get("RELEASE_TAG") or os.environ.get("START_COMMIT", "v1.2.0-rc2")
+PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG") or os.environ.get("END_COMMIT", "v1.1.0")
+# The defaults mean a tag is never missing, only possibly invalid: prompt until both exist
+while not tag_exists(RELEASE_TAG):
+    RELEASE_TAG = raw_input("Please provide a valid release tag: ")
+while not tag_exists(PREVIOUS_RELEASE_TAG):
+    print "Please specify the previous release tag."
+    PREVIOUS_RELEASE_TAG = raw_input(\
+        "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")
+
+# Gather commits found in the new tag but not in the old tag.
+# This filters commits based on both the git hash and the PR number.
+# If either is present in the old tag, then we ignore the commit.
+print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG)
+release_commits = get_commits(RELEASE_TAG)
+previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG)
+previous_release_hashes = set()
+previous_release_prs = set()
+for old_commit in previous_release_commits:
+    previous_release_hashes.add(old_commit.get_hash())
+    if old_commit.get_pr_number():
+        previous_release_prs.add(old_commit.get_pr_number())
+new_commits = []
+for this_commit in release_commits:
+    this_hash = this_commit.get_hash()
+    this_pr_number = this_commit.get_pr_number()
+    if this_hash in previous_release_hashes:
+        continue
+    if this_pr_number and this_pr_number in previous_release_prs:
+        continue
+    new_commits.append(this_commit)
+if not new_commits:
+    sys.exit("There are no new commits between %s and %s!" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG))
+
+# Prompt the user for confirmation that the commit range is correct
 print "\n=================================================================================="
 print "JIRA server: %s" % JIRA_API_BASE
-print "Start commit (inclusive): %s" % start_commit_line
-print "End commit (non-inclusive): %s" % end_commit_line
-print "Number of commits in this range: %s" % num_commits
+print "Release tag: %s" % RELEASE_TAG
+print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG
+print "Number of commits in this range: %s" % len(new_commits)
 print
-response = raw_input("Is this correct? [Y/n] ")
-if response.lower() != "y" and response:
-    sys.exit("Ok, exiting")
+def print_indented(_list):
+    for x in _list: print "  %s" % x
+if yesOrNoPrompt("Show all commits?"):
+    print_indented(new_commits)
 print "==================================================================================\n"
-
-# Find all commits within this range
-print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
-commits = get_one_line_commits(START_COMMIT, END_COMMIT)
-if not commits: sys.exit("Error: No commits found within this range!")
-commits = commits.split("\n")
+if not yesOrNoPrompt("Does this look correct?"):
+    sys.exit("Ok, exiting")
 
 # Filter out special commits
 releases = []
+maintenance = []
 reverts = []
 nojiras = []
 filtered_commits = []
-def is_release(commit):
-    return re.findall("\[release\]", commit.lower()) or\
-        "maven-release-plugin" in commit or "CHANGES.txt" in commit
-def has_no_jira(commit):
-    return not re.findall("SPARK-[0-9]+", commit.upper())
-def is_revert(commit):
-    return "revert" in commit.lower()
-def is_docs(commit):
-    return re.findall("docs*", commit.lower()) or "programming guide" in commit.lower()
-for c in commits:
-    if not c: continue
-    elif is_release(c): releases.append(c)
-    elif is_revert(c): reverts.append(c)
-    elif is_docs(c): filtered_commits.append(c) # docs may not have JIRA numbers
-    elif has_no_jira(c): nojiras.append(c)
+def is_release(commit_title):
+    return re.findall("\[release\]", commit_title.lower()) or\
+        "preparing spark release" in commit_title.lower() or\
+        "preparing development version" in commit_title.lower() or\
+        "CHANGES.txt" in commit_title
+def is_maintenance(commit_title):
+    return "maintenance" in commit_title.lower() or\
+      "manually close" in commit_title.lower()
+def has_no_jira(commit_title):
+    return not re.findall("SPARK-[0-9]+", commit_title.upper())
+def is_revert(commit_title):
+    return "revert" in commit_title.lower()
+def is_docs(commit_title):
+    return re.findall("docs*", commit_title.lower()) or\
+        "programming guide" in commit_title.lower()
+for c in new_commits:
+    t = c.get_title()
+    if not t: continue
+    elif is_release(t): releases.append(c)
+    elif is_maintenance(t): maintenance.append(c)
+    elif is_revert(t): reverts.append(c)
+    elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers
+    elif has_no_jira(t): nojiras.append(c)
     else: filtered_commits.append(c)
 
 # Warn against ignored commits
-def print_indented(_list):
-    for x in _list: print "  %s" % x
-if releases or reverts or nojiras:
+if releases or maintenance or reverts or nojiras:
     print "\n=================================================================================="
-    if releases: print "Releases (%d)" % len(releases); print_indented(releases)
-    if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts)
-    if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
+    if releases: print "Found %d release commits" % len(releases)
+    if maintenance: print "Found %d maintenance commits" % len(maintenance)
+    if reverts: print "Found %d revert commits" % len(reverts)
+    if nojiras: print "Found %d commits with no JIRA" % len(nojiras)
+    print "* Warning: these commits will be ignored.\n"
+    if yesOrNoPrompt("Show ignored commits?"):
+        if releases: print "Release (%d)" % len(releases); print_indented(releases)
+        if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance)
+        if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts)
+        if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
     print "==================== Warning: the above commits will be ignored ==================\n"
-response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits))
-if response.lower() != "y" and response:
+prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits)
+if not yesOrNoPrompt(prompt_msg):
     sys.exit("Ok, exiting.")
 
 # Keep track of warnings to tell the user at the end
@@ -123,10 +151,11 @@ jira_options = { "server": JIRA_API_BASE }
 jira_client = JIRA(options = jira_options)
 print "\n=========================== Compiling contributor list ==========================="
 for commit in filtered_commits:
-    commit_hash = re.findall("^[a-z0-9]+", commit)[0]
-    issues = re.findall("SPARK-[0-9]+", commit.upper())
-    author = get_author(commit_hash)
-    author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters
+    _hash = commit.get_hash()
+    title = commit.get_title()
+    issues = re.findall("SPARK-[0-9]+", title.upper())
+    author = commit.get_author()
+    date = get_date(_hash)
     # If the author name is invalid, keep track of it along
     # with all associated issues so we can translate it later
     if is_valid_author(author):
@@ -136,9 +165,8 @@ for commit in filtered_commits:
             invalid_authors[author] = set()
         for issue in issues:
             invalid_authors[author].add(issue)
-    date = get_date(commit_hash)
-    # Parse components from the commit message, if any
-    commit_components = find_components(commit, commit_hash)
+    # Parse components from the commit title, if any
+    commit_components = find_components(title, _hash)
     # Populate or merge an issue into author_info[author]
     def populate(issue_type, components):
         components = components or [CORE_COMPONENT] # assume core if no components provided
@@ -153,14 +181,14 @@ for commit in filtered_commits:
         jira_issue = jira_client.issue(issue)
         jira_type = jira_issue.fields.issuetype.name
         jira_type = translate_issue_type(jira_type, issue, warnings)
-        jira_components = [translate_component(c.name, commit_hash, warnings)\
+        jira_components = [translate_component(c.name, _hash, warnings)\
             for c in jira_issue.fields.components]
         all_components = set(jira_components + commit_components)
         populate(jira_type, all_components)
     # For docs without an associated JIRA, manually add it ourselves
-    if is_docs(commit) and not issues:
+    if is_docs(title) and not issues:
         populate("documentation", commit_components)
-    print "  Processed commit %s authored by %s on %s" % (commit_hash, author, date)
+    print "  Processed commit %s authored by %s on %s" % (_hash, author, date)
 print "==================================================================================\n"
 
 # Write to contributors file ordered by author names
diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
index 76a10c3288..18e16bcb90 100755
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@@ -19,6 +19,7 @@
 # This file contains helper methods used in creating a release.
 
 import re
+import sys
 from subprocess import Popen, PIPE
 
 try:
@@ -47,20 +48,85 @@ except ImportError:
 # Contributors list file name
 contributors_file_name = "contributors.txt"
 
+# Prompt the user until they answer exactly "y" or "n"; return True for "y".
+# Re-asks in a plain loop so repeated bad input cannot grow the call stack.
+def yesOrNoPrompt(msg):
+    response = None
+    while response not in ("y", "n"): response = raw_input("%s [y/n]: " % msg)
+    return response == "y"
+
 # Utility functions run git commands (written with Git 1.8.5)
 def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0]
-def get_author(commit_hash):
-    return run_cmd(["git", "show", "--quiet", "--pretty=format:%an", commit_hash])
+def run_cmd_error(cmd): return Popen(cmd, stderr=PIPE).communicate()[1]
 def get_date(commit_hash):
     return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash])
-def get_one_line(commit_hash):
-    return run_cmd(["git", "show", "--quiet", "--pretty=format:\"%h %cd %s\"", commit_hash])
-def get_one_line_commits(start_hash, end_hash):
-    return run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)])
-def num_commits_in_range(start_hash, end_hash):
-    output = run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)])
-    lines = [line for line in output.split("\n") if line] # filter out empty lines
-    return len(lines)
+def tag_exists(tag):
+    # Read-only check: unlike "git checkout", this never moves HEAD or touches the work tree
+    return Popen(["git", "rev-parse", "-q", "--verify", tag + "^{commit}"], stdout=PIPE).wait() == 0
+
+# A plain record of one commit: its hash, author, title and (optional) PR number
+class Commit:
+    def __init__(self, _hash, author, title, pr_number = None):
+        self._hash = _hash
+        self.author = author
+        self.title = title
+        self.pr_number = pr_number # None when no PR number is known
+    def get_hash(self): return self._hash
+    def get_author(self): return self.author
+    def get_title(self): return self.title
+    def get_pr_number(self): return self.pr_number
+    def __str__(self):
+        closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else ""
+        return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr)
+
+# Return a list of Commit objects, one per commit that `git log <tag>` reports.
+#
+# Under the hood, this runs `git log` with a custom marker-delimited format and
+# splits the output on those markers to recover each commit's abbreviated hash,
+# author name and title. The PR number and github username only appear in the
+# commit body (as "Closes #<number> from <username>/..."), so they are pulled
+# out with a regex. Exits the script if the output does not match the format.
+#
+# This is written using Git 1.8.5.
+def get_commits(tag):
+    commit_start_marker = "|=== COMMIT START MARKER ===|"
+    commit_end_marker = "|=== COMMIT END MARKER ===|"
+    field_end_marker = "|=== COMMIT FIELD END MARKER ===|"
+    log_format =\
+        commit_start_marker + "%h" +\
+        field_end_marker + "%an" +\
+        field_end_marker + "%s" +\
+        commit_end_marker + "%b"
+    output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag])
+    commits = []
+    raw_commits = [c for c in output.split(commit_start_marker) if c]
+    for commit in raw_commits:
+        if commit.count(commit_end_marker) != 1: # exactly one end marker is expected per commit
+            print "Commit end marker not found in commit: "
+            for line in commit.split("\n"): print line
+            sys.exit(1)
+        # Separate the commit digest (everything before the end marker) from the body
+        # From the digest we extract the hash, author and the title
+        # From the body, we extract the PR number and the github username
+        [commit_digest, commit_body] = commit.split(commit_end_marker)
+        if commit_digest.count(field_end_marker) != 2:
+            sys.exit("Unexpected format in commit: %s" % commit_digest)
+        [_hash, author, title] = commit_digest.split(field_end_marker)
+        # The PR number and github username are in the commit message
+        # body itself, so they have to be matched out of the free text
+        pr_number = None
+        match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body)
+        if match:
+            [pr_number, github_username] = match.groups()
+            # If the author name is not valid, use the github
+            # username so we can translate it properly later
+            if not is_valid_author(author):
+                author = github_username
+        # Guard against special characters by decoding as UTF-8 and passing through unidecode
+        author = unidecode.unidecode(unicode(author, "UTF-8")).strip()
+        commit = Commit(_hash, author, title, pr_number)
+        commits.append(commit)
+    return commits
 
 # Maintain a mapping for translating issue types to contributions in the release notes
 # This serves an additional function of warning the user against unknown issue types
@@ -70,10 +136,13 @@ def num_commits_in_range(start_hash, end_hash):
 known_issue_types = {
     "bug": "bug fixes",
     "build": "build fixes",
+    "dependency upgrade": "build fixes",
     "improvement": "improvements",
     "new feature": "new features",
     "documentation": "documentation",
-    "test": "test"
+    "test": "test",
+    "task": "improvements", # same phrase as "improvement" so an author's entries merge
+    "sub-task": "improvements"
 }
 
 # Maintain a mapping for translating component names when creating the release notes
@@ -176,8 +245,7 @@ def get_jira_name(author, jira_client):
 # Return whether the given name is in the form <First Name><space><Last Name>
 def is_valid_author(author):
     if not author: return False
-    author_words = len(author.split(" "))
-    return author_words == 2 or author_words == 3
+    return " " in author and not re.findall("[0-9]", author)
 
 # Capitalize the first letter of each word in the given author name
 def capitalize_author(author):
diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py
index ef4625b003..462c21142f 100755
--- a/dev/create-release/translate-contributors.py
+++ b/dev/create-release/translate-contributors.py
@@ -37,8 +37,11 @@ from releaseutils import *
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
 JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
 JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
+GITHUB_API_TOKEN = os.environ.get("GITHUB_API_TOKEN", None)
 if not JIRA_USERNAME or not JIRA_PASSWORD:
     sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set")
+if not GITHUB_API_TOKEN:
+    sys.exit("GITHUB_API_TOKEN must be set")
 
 # Write new contributors list to <old_file_name>.new
 if not os.path.isfile(contributors_file_name):
@@ -62,7 +65,7 @@ if INTERACTIVE_MODE:
 # Setup Github and JIRA clients
 jira_options = { "server": JIRA_API_BASE }
 jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
-github_client = Github()
+github_client = Github(GITHUB_API_TOKEN)
 
 # Generate candidates for the given author. This should only be called if the given author
 # name does not represent a full name as this operation is somewhat expensive. Under the
@@ -94,7 +97,14 @@ def generate_candidates(author, issues):
     # Then do the same for the assignee of each of the associated JIRAs
     # Note that a given issue may not have an assignee, or the assignee may not have a full name
     for issue in issues:
-        jira_issue = jira_client.issue(issue)
+        try:
+            jira_issue = jira_client.issue(issue)
+        except JIRAError as e:
+            # Do not exit just because an issue is not found!
+            if e.status_code == 404:
+                warnings.append("Issue %s not found!" % issue)
+                continue
+            raise e
         jira_assignee = jira_issue.fields.assignee
         if jira_assignee:
             user_name = jira_assignee.name
@@ -123,9 +133,10 @@ def generate_candidates(author, issues):
 # In non-interactive mode, this script picks the first valid author name from the candidates
 # If no such name exists, the original name is used (without the JIRA numbers).
 print "\n========================== Translating contributor list =========================="
-for line in contributors_file:
+lines = contributors_file.readlines()
+for i, line in enumerate(lines):
     author = line.split(" - ")[0]
-    print "Processing author %s" % author
+    print "Processing author %s (%d/%d)" % (author, i + 1, len(lines))
     if not author:
         print "    ERROR: Expected the following format <author> - <contributions>"
         print "    ERROR: Actual = %s" % line
@@ -135,30 +146,39 @@ for line in contributors_file:
         candidates = generate_candidates(new_author, issues)
         # Print out potential replacement candidates along with the sources, e.g.
         #   [X] No full name found for Github user andrewor14
+        #   [X] No assignee found for SPARK-1763
         #   [0] Andrew Or - Full name of JIRA user andrewor14
         #   [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14
         #   [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14
-        #   [X] No assignee found for SPARK-1763
-        #   [3] Custom
+        #   [3] andrewor14 - Raw Github username
+        #   [4] Custom
         candidate_names = []
+        bad_prompts = [] # Prompts that can't actually be selected; print these first.
+        good_prompts = [] # Prompts that contain valid choices
         for candidate, source in candidates:
             if candidate == NOT_FOUND:
-                print "    [X] %s" % source
+                bad_prompts.append("    [X] %s" % source)
             else:
                 index = len(candidate_names)
                 candidate_names.append(candidate)
-                print "    [%d] %s - %s" % (index, candidate, source)
-        custom_index = len(candidate_names)
+                good_prompts.append("    [%d] %s - %s" % (index, candidate, source))
+        raw_index = len(candidate_names)
+        custom_index = len(candidate_names) + 1
+        for p in bad_prompts: print p
+        if bad_prompts: print "    ---"
+        for p in good_prompts: print p
         # In interactive mode, additionally provide "custom" option and await user response
         if INTERACTIVE_MODE:
+            print "    [%d] %s - Raw Github username" % (raw_index, new_author)
             print "    [%d] Custom" % custom_index
             response = raw_input("    Your choice: ")
-            while not response.isdigit() or int(response) > custom_index:
-                response = raw_input("    Please enter an integer between 0 and %d: " % custom_index)
+            last_index = custom_index
+            while not response.isdigit() or int(response) > last_index:
+                response = raw_input("    Please enter an integer between 0 and %d: " % last_index)
             response = int(response)
             if response == custom_index:
                 new_author = raw_input("    Please type a custom name for this author: ")
-            else:
+            elif response != raw_index:
                 new_author = candidate_names[response]
         # In non-interactive mode, just pick the first candidate
         else:
@@ -175,6 +195,7 @@ for line in contributors_file:
         print "    * Replacing %s with %s" % (author, new_author)
         line = line.replace(author, new_author)
     new_contributors_file.write(line)
+    new_contributors_file.flush()
 print "==================================================================================\n"
 contributors_file.close()
 new_contributors_file.close()