From b85044ecfa825ff68c8e57eeffa4d9f214335e66 Mon Sep 17 00:00:00 2001
From: Andrew Or <andrew@databricks.com>
Date: Tue, 16 Dec 2014 19:28:43 -0800
Subject: [Release] Cache known author translations locally

This bypasses unnecessary calls to the GitHub and JIRA APIs.
Additionally, having a local cache allows us to remember names
that we had to manually discover ourselves.
---
 dev/create-release/generate-contributors.py  | 18 ++++----
 dev/create-release/known_translations        | 59 +++++++++++++++++++++++++
 dev/create-release/releaseutils.py           |  4 +-
 dev/create-release/translate-contributors.py | 64 ++++++++++++++++++++--------
 4 files changed, 116 insertions(+), 29 deletions(-)
 create mode 100644 dev/create-release/known_translations

diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py
index e8f81ccbce..e65c5d8233 100755
--- a/dev/create-release/generate-contributors.py
+++ b/dev/create-release/generate-contributors.py
@@ -26,8 +26,8 @@ from releaseutils import *
 
 # You must set the following before use!
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
-RELEASE_TAG = os.environ.get("START_COMMIT", "v1.2.0-rc2")
-PREVIOUS_RELEASE_TAG = os.environ.get("END_COMMIT", "v1.1.0")
+RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2")
+PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0")
 
 # If the release tags are not provided, prompt the user to provide them
 while not tag_exists(RELEASE_TAG):
@@ -35,7 +35,7 @@ while not tag_exists(RELEASE_TAG):
 while not tag_exists(PREVIOUS_RELEASE_TAG):
     print "Please specify the previous release tag."
     PREVIOUS_RELEASE_TAG = raw_input(\
-        "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")
+      "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ")
 
 # Gather commits found in the new tag but not in the old tag.
 # This filters commits based on both the git hash and the PR number.
@@ -84,9 +84,9 @@ nojiras = []
 filtered_commits = []
 def is_release(commit_title):
     return re.findall("\[release\]", commit_title.lower()) or\
-        "preparing spark release" in commit_title.lower() or\
-        "preparing development version" in commit_title.lower() or\
-        "CHANGES.txt" in commit_title
+      "preparing spark release" in commit_title.lower() or\
+      "preparing development version" in commit_title.lower() or\
+      "CHANGES.txt" in commit_title
 def is_maintenance(commit_title):
     return "maintenance" in commit_title.lower() or\
       "manually close" in commit_title.lower()
@@ -96,7 +96,7 @@ def is_revert(commit_title):
     return "revert" in commit_title.lower()
 def is_docs(commit_title):
     return re.findall("docs*", commit_title.lower()) or\
-        "programming guide" in commit_title.lower()
+      "programming guide" in commit_title.lower()
 for c in new_commits:
     t = c.get_title()
     if not t: continue
@@ -182,7 +182,7 @@ for commit in filtered_commits:
         jira_type = jira_issue.fields.issuetype.name
         jira_type = translate_issue_type(jira_type, issue, warnings)
         jira_components = [translate_component(c.name, _hash, warnings)\
-            for c in jira_issue.fields.components]
+          for c in jira_issue.fields.components]
         all_components = set(jira_components + commit_components)
         populate(jira_type, all_components)
     # For docs without an associated JIRA, manually add it ourselves
@@ -213,7 +213,7 @@ for author in authors:
     # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN
     else:
         contributions = ["%s in %s" % (issue_type, nice_join(comps)) \
-            for issue_type, comps in author_info[author].items()]
+          for issue_type, comps in author_info[author].items()]
         contribution = "; ".join(contributions)
     # Do not use python's capitalize() on the whole string to preserve case
     assert contribution
diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations
new file mode 100644
index 0000000000..b74e4ee8a3
--- /dev/null
+++ b/dev/create-release/known_translations
@@ -0,0 +1,59 @@
+# This is a mapping of names to be translated through translate-contributors.py
+# The format expected on each line should be: <old name> - <new name>
+CodingCat - Nan Zhu
+CrazyJvm - Chao Chen
+EugenCepoi - Eugen Cepoi
+GraceH - Jie Huang
+JerryLead - Lijie Xu
+Leolh - Liu Hao
+Lewuathe - Kai Sasaki
+RongGu - Rong Gu
+Shiti - Shiti Saxena
+Victsm - Min Shen
+WangTaoTheTonic - Wang Tao
+XuTingjun - Tingjun Xu
+YanTangZhai - Yantang Zhai
+alexdebrie - Alex DeBrie
+alokito - Alok Saldanha
+anantasty - Anant Asthana
+andrewor14 - Andrew Or
+aniketbhatnagar - Aniket Bhatnagar
+arahuja - Arun Ahuja
+brkyvz - Burak Yavuz
+chesterxgchen - Chester Chen
+chiragaggarwal - Chirag Aggarwal
+chouqin - Qiping Li
+cocoatomo - Tomohiko K.
+coderfi - Fairiz Azizi
+coderxiang - Shuo Xiang
+davies - Davies Liu
+epahomov - Egor Pahomov
+falaki - Hossein Falaki
+freeman-lab - Jeremy Freeman
+industrial-sloth - Jascha Swisher
+jackylk - Jacky Li
+jayunit100 - Jay Vyas
+jerryshao - Saisai Shao
+jkbradley - Joseph Bradley
+lianhuiwang - Lianhui Wang
+lirui-intel - Rui Li
+luluorta - Lu Lu
+luogankun - Gankun Luo
+maji2014 - Derek Ma
+mccheah - Matthew Cheah
+mengxr - Xiangrui Meng
+nartz - Nathan Artz
+odedz - Oded Zimerman
+ravipesala - Ravindra Pesala
+roxchkplusony - Victor Tso
+scwf - Wang Fei
+shimingfei - Shiming Fei
+surq - Surong Quan
+suyanNone - Su Yan
+tedyu - Ted Yu
+tigerquoll - Dale Richardson
+wangxiaojing - Xiaojing Wang
+watermen - Yadong Qi
+witgo - Guoqiang Li
+xinyunh - Xinyun Huang
+zsxwing - Shixiong Zhu
diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
index 18e16bcb90..26221b2703 100755
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@@ -57,11 +57,11 @@ def yesOrNoPrompt(msg):
 
 # Utility functions run git commands (written with Git 1.8.5)
 def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0]
-def run_cmd_error(cmd): return Popen(cmd, stderr=PIPE).communicate()[1]
+def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1]
 def get_date(commit_hash):
     return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash])
 def tag_exists(tag):
-    stderr = run_cmd_error(["git", "checkout", tag])
-    return "error" not in stderr
+    stderr = run_cmd_error(["git", "show", tag])
+    return "error" not in stderr and "fatal" not in stderr  # unknown revs print "fatal: ..."
 
 # A type-safe representation of a commit
diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py
index 462c21142f..f3b1efdd42 100755
--- a/dev/create-release/translate-contributors.py
+++ b/dev/create-release/translate-contributors.py
@@ -67,6 +67,19 @@ jira_options = { "server": JIRA_API_BASE }
 jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
 github_client = Github(GITHUB_API_TOKEN)
 
+# Load known author translations that are cached locally ("<old name> - <new name>" per line)
+known_translations = {}
+known_translations_file_name = "known_translations"
+known_translations_file = open(known_translations_file_name, "r")
+for line in known_translations_file:
+    if line.startswith("#") or not line.strip(): continue
+    [old_name, new_name] = line.strip().split(" - ")  # drop the trailing newline first
+    known_translations[old_name] = new_name
+known_translations_file.close()
+
+# Open again in case the user adds new mappings
+known_translations_file = open(known_translations_file_name, "a")
+
 # Generate candidates for the given author. This should only be called if the given author
 # name does not represent a full name as this operation is somewhat expensive. Under the
 # hood, it makes several calls to the Github and JIRA API servers to find the candidates.
@@ -83,17 +96,17 @@ NOT_FOUND = "Not found"
 def generate_candidates(author, issues):
     candidates = []
     # First check for full name of Github user
-    github_name = get_github_name(new_author, github_client)
+    github_name = get_github_name(author, github_client)
     if github_name:
-        candidates.append((github_name, "Full name of Github user %s" % new_author))
+        candidates.append((github_name, "Full name of Github user %s" % author))
     else:
-        candidates.append((NOT_FOUND, "No full name found for Github user %s" % new_author))
+        candidates.append((NOT_FOUND, "No full name found for Github user %s" % author))
     # Then do the same for JIRA user
-    jira_name = get_jira_name(new_author, jira_client)
+    jira_name = get_jira_name(author, jira_client)
     if jira_name:
-        candidates.append((jira_name, "Full name of JIRA user %s" % new_author))
+        candidates.append((jira_name, "Full name of JIRA user %s" % author))
     else:
-        candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % new_author))
+        candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author))
     # Then do the same for the assignee of each of the associated JIRAs
     # Note that a given issue may not have an assignee, or the assignee may not have a full name
     for issue in issues:
@@ -135,15 +148,24 @@ def generate_candidates(author, issues):
 print "\n========================== Translating contributor list =========================="
 lines = contributors_file.readlines()
 for i, line in enumerate(lines):
-    author = line.split(" - ")[0]
-    print "Processing author %s (%d/%d)" % (author, i + 1, len(lines))
-    if not author:
-        print "    ERROR: Expected the following format <author> - <contributions>"
-        print "    ERROR: Actual = %s" % line
-    if not is_valid_author(author):
-        new_author = author.split("/")[0]
-        issues = author.split("/")[1:]
-        candidates = generate_candidates(new_author, issues)
+    temp_author = line.split(" - ")[0]
+    print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines))
+    if not temp_author:
+        error_msg = "    ERROR: Expected the following format <author> - <contributions>\n"
+        error_msg += "    ERROR: Actual = %s" % line
+        print error_msg
+        warnings.append(error_msg)
+        new_contributors_file.write(line)
+        new_contributors_file.flush()
+        continue
+    author = temp_author.split("/")[0]
+    # Use the local copy of known translations where possible
+    if author in known_translations:
+        line = line.replace(temp_author, known_translations[author])
+    elif not is_valid_author(author):
+        new_author = author
+        issues = temp_author.split("/")[1:]
+        candidates = generate_candidates(author, issues)
         # Print out potential replacement candidates along with the sources, e.g.
         #   [X] No full name found for Github user andrewor14
         #   [X] No assignee found for SPARK-1763
@@ -169,7 +191,7 @@ for i, line in enumerate(lines):
         for p in good_prompts: print p
         # In interactive mode, additionally provide "custom" option and await user response
         if INTERACTIVE_MODE:
-            print "    [%d] %s - Raw Github username" % (raw_index, new_author)
+            print "    [%d] %s - Raw Github username" % (raw_index, author)
             print "    [%d] Custom" % custom_index
             response = raw_input("    Your choice: ")
             last_index = custom_index
@@ -191,9 +213,15 @@ for i, line in enumerate(lines):
         if is_valid_author(new_author):
             new_author = capitalize_author(new_author)
         else:
-            warnings.append("Unable to find a valid name %s for author %s" % (new_author, author))
+            warnings.append("Unable to find a valid name %s for author %s" % (new_author, temp_author))
         print "    * Replacing %s with %s" % (author, new_author)
-        line = line.replace(author, new_author)
+        # In interactive mode, ask the user whether to remember this new mapping
+        if INTERACTIVE_MODE and\
+          author not in known_translations and\
+          yesOrNoPrompt("    Add mapping %s -> %s to known translations file?" % (author, new_author)):
+            known_translations_file.write("%s - %s\n" % (author, new_author))
+            known_translations_file.flush()
+        line = line.replace(temp_author, new_author)
     new_contributors_file.write(line)
     new_contributors_file.flush()
 print "==================================================================================\n"
-- 
cgit v1.2.3