From b85044ecfa825ff68c8e57eeffa4d9f214335e66 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 16 Dec 2014 19:28:43 -0800 Subject: [Release] Cache known author translations locally This bypasses unnecessary calls to the Github and JIRA API. Additionally, having a local cache allows us to remember names that we had to manually discover ourselves. --- dev/create-release/generate-contributors.py | 18 ++++---- dev/create-release/known_translations | 59 +++++++++++++++++++++++++ dev/create-release/releaseutils.py | 4 +- dev/create-release/translate-contributors.py | 64 ++++++++++++++++++++-------- 4 files changed, 116 insertions(+), 29 deletions(-) create mode 100644 dev/create-release/known_translations diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index e8f81ccbce..e65c5d8233 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -26,8 +26,8 @@ from releaseutils import * # You must set the following before use! JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -RELEASE_TAG = os.environ.get("START_COMMIT", "v1.2.0-rc2") -PREVIOUS_RELEASE_TAG = os.environ.get("END_COMMIT", "v1.1.0") +RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2") +PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0") # If the release tags are not provided, prompt the user to provide them while not tag_exists(RELEASE_TAG): @@ -35,7 +35,7 @@ while not tag_exists(RELEASE_TAG): while not tag_exists(PREVIOUS_RELEASE_TAG): print "Please specify the previous release tag." PREVIOUS_RELEASE_TAG = raw_input(\ - "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") + "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") # Gather commits found in the new tag but not in the old tag. # This filters commits based on both the git hash and the PR number. 
@@ -84,9 +84,9 @@ nojiras = [] filtered_commits = [] def is_release(commit_title): return re.findall("\[release\]", commit_title.lower()) or\ - "preparing spark release" in commit_title.lower() or\ - "preparing development version" in commit_title.lower() or\ - "CHANGES.txt" in commit_title + "preparing spark release" in commit_title.lower() or\ + "preparing development version" in commit_title.lower() or\ + "CHANGES.txt" in commit_title def is_maintenance(commit_title): return "maintenance" in commit_title.lower() or\ "manually close" in commit_title.lower() @@ -96,7 +96,7 @@ def is_revert(commit_title): return "revert" in commit_title.lower() def is_docs(commit_title): return re.findall("docs*", commit_title.lower()) or\ - "programming guide" in commit_title.lower() + "programming guide" in commit_title.lower() for c in new_commits: t = c.get_title() if not t: continue @@ -182,7 +182,7 @@ for commit in filtered_commits: jira_type = jira_issue.fields.issuetype.name jira_type = translate_issue_type(jira_type, issue, warnings) jira_components = [translate_component(c.name, _hash, warnings)\ - for c in jira_issue.fields.components] + for c in jira_issue.fields.components] all_components = set(jira_components + commit_components) populate(jira_type, all_components) # For docs without an associated JIRA, manually add it ourselves @@ -213,7 +213,7 @@ for author in authors: # e.g. 
Bug fixes in MLlib, Core, and Streaming; documentation in YARN else: contributions = ["%s in %s" % (issue_type, nice_join(comps)) \ - for issue_type, comps in author_info[author].items()] + for issue_type, comps in author_info[author].items()] contribution = "; ".join(contributions) # Do not use python's capitalize() on the whole string to preserve case assert contribution diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations new file mode 100644 index 0000000000..b74e4ee8a3 --- /dev/null +++ b/dev/create-release/known_translations @@ -0,0 +1,59 @@ +# This is a mapping of names to be translated through translate-contributors.py +# The format expected on each line should be: <Github ID> - <full name> +CodingCat - Nan Zhu +CrazyJvm - Chao Chen +EugenCepoi - Eugen Cepoi +GraceH - Jie Huang +JerryLead - Lijie Xu +Leolh - Liu Hao +Lewuathe - Kai Sasaki +RongGu - Rong Gu +Shiti - Shiti Saxena +Victsm - Min Shen +WangTaoTheTonic - Wang Tao +XuTingjun - Tingjun Xu +YanTangZhai - Yantang Zhai +alexdebrie - Alex DeBrie +alokito - Alok Saldanha +anantasty - Anant Asthana +andrewor14 - Andrew Or +aniketbhatnagar - Aniket Bhatnagar +arahuja - Arun Ahuja +brkyvz - Burak Yavuz +chesterxgchen - Chester Chen +chiragaggarwal - Chirag Aggarwal +chouqin - Qiping Li +cocoatomo - Tomohiko K. 
+coderfi - Fairiz Azizi +coderxiang - Shuo Xiang +davies - Davies Liu +epahomov - Egor Pahomov +falaki - Hossein Falaki +freeman-lab - Jeremy Freeman +industrial-sloth - Jascha Swisher +jackylk - Jacky Li +jayunit100 - Jay Vyas +jerryshao - Saisai Shao +jkbradley - Joseph Bradley +lianhuiwang - Lianhui Wang +lirui-intel - Rui Li +luluorta - Lu Lu +luogankun - Gankun Luo +maji2014 - Derek Ma +mccheah - Matthew Cheah +mengxr - Xiangrui Meng +nartz - Nathan Artz +odedz - Oded Zimerman +ravipesala - Ravindra Pesala +roxchkplusony - Victor Tso +scwf - Wang Fei +shimingfei - Shiming Fei +surq - Surong Quan +suyanNone - Su Yan +tedyu - Ted Yu +tigerquoll - Dale Richardson +wangxiaojing - Xiaojing Wang +watermen - Yadong Qi +witgo - Guoqiang Li +xinyunh - Xinyun Huang +zsxwing - Shixiong Zhu diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 18e16bcb90..26221b2703 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -57,11 +57,11 @@ def yesOrNoPrompt(msg): # Utility functions run git commands (written with Git 1.8.5) def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] -def run_cmd_error(cmd): return Popen(cmd, stderr=PIPE).communicate()[1] +def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1] def get_date(commit_hash): return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) def tag_exists(tag): - stderr = run_cmd_error(["git", "checkout", tag]) + stderr = run_cmd_error(["git", "show", tag]) return "error" not in stderr # A type-safe representation of a commit diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index 462c21142f..f3b1efdd42 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -67,6 +67,19 @@ jira_options = { "server": JIRA_API_BASE } jira_client = JIRA(options = jira_options, basic_auth = 
(JIRA_USERNAME, JIRA_PASSWORD)) github_client = Github(GITHUB_API_TOKEN) +# Load known author translations that are cached locally +known_translations = {} +known_translations_file_name = "known_translations" +known_translations_file = open(known_translations_file_name, "r") +for line in known_translations_file: + if line.startswith("#"): continue + [old_name, new_name] = line.split(" - ") + known_translations[old_name] = new_name +known_translations_file.close() + +# Open again in case the user adds new mappings +known_translations_file = open(known_translations_file_name, "a") + # Generate candidates for the given author. This should only be called if the given author # name does not represent a full name as this operation is somewhat expensive. Under the # hood, it makes several calls to the Github and JIRA API servers to find the candidates. @@ -83,17 +96,17 @@ NOT_FOUND = "Not found" def generate_candidates(author, issues): candidates = [] # First check for full name of Github user - github_name = get_github_name(new_author, github_client) + github_name = get_github_name(author, github_client) if github_name: - candidates.append((github_name, "Full name of Github user %s" % new_author)) + candidates.append((github_name, "Full name of Github user %s" % author)) else: - candidates.append((NOT_FOUND, "No full name found for Github user %s" % new_author)) + candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) # Then do the same for JIRA user - jira_name = get_jira_name(new_author, jira_client) + jira_name = get_jira_name(author, jira_client) if jira_name: - candidates.append((jira_name, "Full name of JIRA user %s" % new_author)) + candidates.append((jira_name, "Full name of JIRA user %s" % author)) else: - candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % new_author)) + candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author)) # Then do the same for the assignee of each of the associated JIRAs # 
Note that a given issue may not have an assignee, or the assignee may not have a full name for issue in issues: @@ -135,15 +148,24 @@ def generate_candidates(author, issues): print "\n========================== Translating contributor list ==========================" lines = contributors_file.readlines() for i, line in enumerate(lines): - author = line.split(" - ")[0] - print "Processing author %s (%d/%d)" % (author, i + 1, len(lines)) - if not author: - print " ERROR: Expected the following format - " - print " ERROR: Actual = %s" % line - if not is_valid_author(author): - new_author = author.split("/")[0] - issues = author.split("/")[1:] - candidates = generate_candidates(new_author, issues) + temp_author = line.split(" - ")[0] + print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) + if not temp_author: + error_msg = " ERROR: Expected the following format - \n" + error_msg += " ERROR: Actual = %s" % line + print error_msg + warnings.append(error_msg) + new_contributors_file.write(line) + new_contributors_file.flush() + continue + author = temp_author.split("/")[0] + # Use the local copy of known translations where possible + if author in known_translations: + line = line.replace(temp_author, known_translations[author]) + elif not is_valid_author(author): + new_author = author + issues = temp_author.split("/")[1:] + candidates = generate_candidates(author, issues) # Print out potential replacement candidates along with the sources, e.g. 
# [X] No full name found for Github user andrewor14 # [X] No assignee found for SPARK-1763 @@ -169,7 +191,7 @@ for i, line in enumerate(lines): for p in good_prompts: print p # In interactive mode, additionally provide "custom" option and await user response if INTERACTIVE_MODE: - print " [%d] %s - Raw Github username" % (raw_index, new_author) + print " [%d] %s - Raw Github username" % (raw_index, author) print " [%d] Custom" % custom_index response = raw_input(" Your choice: ") last_index = custom_index @@ -191,9 +213,15 @@ for i, line in enumerate(lines): if is_valid_author(new_author): new_author = capitalize_author(new_author) else: - warnings.append("Unable to find a valid name %s for author %s" % (new_author, author)) + warnings.append("Unable to find a valid name %s for author %s" % (author, temp_author)) print " * Replacing %s with %s" % (author, new_author) - line = line.replace(author, new_author) + # If we are in interactive mode, prompt the user whether we want to remember this new mapping + if INTERACTIVE_MODE and\ + author not in known_translations and\ + yesOrNoPrompt(" Add mapping %s -> %s to known translations file?" % (author, new_author)): + known_translations_file.write("%s - %s\n" % (author, new_author)) + known_translations_file.flush() + line = line.replace(temp_author, author) new_contributors_file.write(line) new_contributors_file.flush() print "==================================================================================\n" -- cgit v1.2.3