[Release] Major improvements to generate contributors script

This commit introduces several major improvements to the script that generates the contributors list for release notes, notably: (1) Use release tags instead of a range of commits. Across branches, commits are not actually strictly two-dimensional, and so it is not sufficient to specify a start hash and an end hash. Otherwise, we end up counting commits that were already merged in an older branch. (2) Match PR numbers in addition to commit hashes. This is related to the first point in that if a PR is already merged in an older minor release tag, it should be filtered out here. This requires us to do some intelligent regex parsing on the commit description in addition to just relying on the GitHub API. (3) Relax author validity check. The old code fails on a name that has many middle names, for instance. The test was just too strict. (4) Use GitHub authentication. This allows us to make far more requests through the GitHub API than before (5000 as opposed to 60 per hour). (5) Translate from Github username, not commit author name. This is important because the commit author name is not always configured correctly by the user. For instance, the username "falaki" used to resolve to just "Hossein", which was treated as a github username and translated to something else that is completely arbitrary. (6) Add an option to use the untranslated name. If there is not a satisfactory candidate to replace the untranslated name with, at least allow the user to not translate it.
author: Andrew Or <andrew@databricks.com> 2014-12-16 17:55:27 -0800
committer: Andrew Or <andrew@databricks.com> 2014-12-16 17:55:27 -0800
commit: 6f80b749e0f23f2f5c83993b87f73c8943b99db8 (patch)
tree: e99c70a854d7714c651c2e99aa70df70111904c4 /dev/create-release/translate-contributors.py
parent: fa66ef6c97e87c9255b67b03836a4ba50598ebae (diff)
download: spark-6f80b749e0f23f2f5c83993b87f73c8943b99db8.tar.gz
spark-6f80b749e0f23f2f5c83993b87f73c8943b99db8.tar.bz2
spark-6f80b749e0f23f2f5c83993b87f73c8943b99db8.zip
1 files changed, 33 insertions, 12 deletions
diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py
index ef4625b003..462c21142f 100755
--- a/dev/create-release/translate-contributors.py
+++ b/dev/create-release/translate-contributors.py
@@ -37,8 +37,11 @@ from releaseutils import *
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
 JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
 JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
+GITHUB_API_TOKEN = os.environ.get("GITHUB_API_TOKEN", None)
 if not JIRA_USERNAME or not JIRA_PASSWORD:
     sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set")
+if not GITHUB_API_TOKEN:
+    sys.exit("GITHUB_API_TOKEN must be set")
 
 # Write new contributors list to <old_file_name>.new
 if not os.path.isfile(contributors_file_name):
@@ -62,7 +65,7 @@ if INTERACTIVE_MODE:
 # Setup Github and JIRA clients
 jira_options = { "server": JIRA_API_BASE }
 jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
-github_client = Github()
+github_client = Github(GITHUB_API_TOKEN)
 
 # Generate candidates for the given author. This should only be called if the given author
 # name does not represent a full name as this operation is somewhat expensive. Under the
@@ -94,7 +97,14 @@ def generate_candidates(author, issues):
     # Then do the same for the assignee of each of the associated JIRAs
     # Note that a given issue may not have an assignee, or the assignee may not have a full name
     for issue in issues:
-        jira_issue = jira_client.issue(issue)
+        try:
+            jira_issue = jira_client.issue(issue)
+        except JIRAError as e:
+            # Do not exit just because an issue is not found!
+            if e.status_code == 404:
+                warnings.append("Issue %s not found!" % issue)
+                continue
+            raise e
         jira_assignee = jira_issue.fields.assignee
         if jira_assignee:
             user_name = jira_assignee.name
@@ -123,9 +133,10 @@ def generate_candidates(author, issues):
 # In non-interactive mode, this script picks the first valid author name from the candidates
 # If no such name exists, the original name is used (without the JIRA numbers).
 print "\n========================== Translating contributor list =========================="
-for line in contributors_file:
+lines = contributors_file.readlines()
+for i, line in enumerate(lines):
     author = line.split(" - ")[0]
-    print "Processing author %s" % author
+    print "Processing author %s (%d/%d)" % (author, i + 1, len(lines))
     if not author:
         print "    ERROR: Expected the following format <author> - <contributions>"
         print "    ERROR: Actual = %s" % line
@@ -135,30 +146,39 @@ for line in contributors_file:
         candidates = generate_candidates(new_author, issues)
         # Print out potential replacement candidates along with the sources, e.g.
         #   [X] No full name found for Github user andrewor14
+        #   [X] No assignee found for SPARK-1763
         #   [0] Andrew Or - Full name of JIRA user andrewor14
         #   [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14
         #   [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14
-        #   [X] No assignee found for SPARK-1763
-        #   [3] Custom
+        #   [3] andrewor14 - Raw Github username
+        #   [4] Custom
         candidate_names = []
+        bad_prompts = [] # Prompts that can't actually be selected; print these first.
+        good_prompts = [] # Prompts that contain valid choices
         for candidate, source in candidates:
             if candidate == NOT_FOUND:
-                print "    [X] %s" % source
+                bad_prompts.append("    [X] %s" % source)
             else:
                 index = len(candidate_names)
                 candidate_names.append(candidate)
-                print "    [%d] %s - %s" % (index, candidate, source)
-        custom_index = len(candidate_names)
+                good_prompts.append("    [%d] %s - %s" % (index, candidate, source))
+        raw_index = len(candidate_names)
+        custom_index = len(candidate_names) + 1
+        for p in bad_prompts: print p
+        if bad_prompts: print "    ---"
+        for p in good_prompts: print p
         # In interactive mode, additionally provide "custom" option and await user response
         if INTERACTIVE_MODE:
+            print "    [%d] %s - Raw Github username" % (raw_index, new_author)
             print "    [%d] Custom" % custom_index
             response = raw_input("    Your choice: ")
-            while not response.isdigit() or int(response) > custom_index:
-                response = raw_input("    Please enter an integer between 0 and %d: " % custom_index)
+            last_index = custom_index
+            while not response.isdigit() or int(response) > last_index:
+                response = raw_input("    Please enter an integer between 0 and %d: " % last_index)
             response = int(response)
             if response == custom_index:
                 new_author = raw_input("    Please type a custom name for this author: ")
-            else:
+            elif response != raw_index:
                 new_author = candidate_names[response]
         # In non-interactive mode, just pick the first candidate
         else:
@@ -175,6 +195,7 @@ for line in contributors_file:
         print "    * Replacing %s with %s" % (author, new_author)
         line = line.replace(author, new_author)
     new_contributors_file.write(line)
+    new_contributors_file.flush()
 print "==================================================================================\n"
 contributors_file.close()
 new_contributors_file.close()
author	Andrew Or <andrew@databricks.com>	2014-12-16 17:55:27 -0800
committer	Andrew Or <andrew@databricks.com>	2014-12-16 17:55:27 -0800
commit	6f80b749e0f23f2f5c83993b87f73c8943b99db8 (patch)
tree	e99c70a854d7714c651c2e99aa70df70111904c4 /dev/create-release/translate-contributors.py
parent	fa66ef6c97e87c9255b67b03836a4ba50598ebae (diff)
download	spark-6f80b749e0f23f2f5c83993b87f73c8943b99db8.tar.gz spark-6f80b749e0f23f2f5c83993b87f73c8943b99db8.tar.bz2 spark-6f80b749e0f23f2f5c83993b87f73c8943b99db8.zip