From 5da21f07d862212067719ddaa2fef6e09db21c10 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 2 Dec 2014 16:36:12 -0800 Subject: [Release] Translate unknown author names automatically --- dev/create-release/generate-contributors.py | 36 +++++------ dev/create-release/releaseutils.py | 93 +++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 18 deletions(-) (limited to 'dev') diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index f4bf734081..99c29ef9ff 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -26,23 +26,11 @@ from releaseutils import * # You must set the following before use! JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) START_COMMIT = os.environ.get("START_COMMIT", "37b100") END_COMMIT = os.environ.get("END_COMMIT", "3693ae") -try: - from jira.client import JIRA -except ImportError: - print "This tool requires the jira-python library" - print "Install using 'sudo pip install jira-python'" - sys.exit(-1) - -try: - import unidecode -except ImportError: - print "This tool requires the unidecode library to decode obscure github usernames" - print "Install using 'sudo pip install unidecode'" - sys.exit(-1) - # If commit range is not specified, prompt the user to provide it if not START_COMMIT or not END_COMMIT: print "A commit range is required to proceed." 
@@ -52,6 +40,8 @@ if not START_COMMIT or not END_COMMIT: END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ") # Verify provided arguments +if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided") +if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided") start_commit_line = get_one_line(START_COMMIT) end_commit_line = get_one_line(END_COMMIT) num_commits = num_commits_in_range(START_COMMIT, END_COMMIT) @@ -70,6 +60,14 @@ if response.lower() != "y" and response: sys.exit("Ok, exiting") print "==================================================================================\n" +# Setup JIRA and github clients. We use two JIRA clients, one with authentication +# and one without, because authentication is slow and required only when we query +# JIRA user details but not Spark issues +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options) +jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) +github_client = Github() + # Find all commits within this range print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT) commits = get_one_line_commits(START_COMMIT, END_COMMIT) @@ -129,14 +127,16 @@ warnings = [] # } # author_info = {} -jira_options = { "server": JIRA_API_BASE } -jira = JIRA(jira_options) print "\n=========================== Compiling contributor list ===========================" for commit in filtered_commits: commit_hash = re.findall("^[a-z0-9]+", commit)[0] issues = re.findall("SPARK-[0-9]+", commit.upper()) + # Translate the author in case the github username is not an actual name + # Also guard against any special characters used in the name + # Note the JIRA client we use here must have authentication enabled author = get_author(commit_hash) - author = unidecode.unidecode(unicode(author, "UTF-8")) # guard against special characters + author = unidecode.unidecode(unicode(author, "UTF-8")) + author = translate_author(author, 
github_client, jira_client_auth, warnings) date = get_date(commit_hash) # Parse components from the commit message, if any commit_components = find_components(commit, commit_hash) @@ -151,7 +151,7 @@ for commit in filtered_commits: author_info[author][issue_type].add(component) # Find issues and components associated with this commit for issue in issues: - jira_issue = jira.issue(issue) + jira_issue = jira_client.issue(issue) jira_type = jira_issue.fields.issuetype.name jira_type = translate_issue_type(jira_type, issue, warnings) jira_components = [translate_component(c.name, commit_hash, warnings)\ diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index e56d7fa58f..0d6830b11d 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -21,6 +21,29 @@ import re from subprocess import Popen, PIPE +try: + from jira.client import JIRA + from jira.exceptions import JIRAError +except ImportError: + print "This tool requires the jira-python library" + print "Install using 'sudo pip install jira-python'" + sys.exit(-1) + +try: + from github import Github + from github import GithubException +except ImportError: + print "This tool requires the PyGithub library" + print "Install using 'sudo pip install PyGithub'" + sys.exit(-1) + +try: + import unidecode +except ImportError: + print "This tool requires the unidecode library to decode obscure github usernames" + print "Install using 'sudo pip install unidecode'" + sys.exit(-1) + # Utility functions run git commands (written with Git 1.8.5) def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] def get_author(commit_hash): @@ -122,3 +145,73 @@ def nice_join(str_list): else: return ", ".join(str_list[:-1]) + ", and " + str_list[-1] +# Return the full name of the specified user on Github +# If the user doesn't exist, return None +def get_github_name(author, github_client): + if github_client: + try: + return github_client.get_user(author).name + 
except GithubException as e: + # If this is not a "not found" exception + if e.status != 404: + raise e + return None + +# Return the full name of the specified user on JIRA +# If the user doesn't exist, return None +def get_jira_name(author, jira_client): + if jira_client: + try: + return jira_client.user(author).displayName + except JIRAError as e: + # If this is not a "not found" exception + if e.status_code != 404: + raise e + return None + +# Return whether the given name is in the form of two or three space-separated parts +def is_valid_author(author): + if not author: return False + author_words = len(author.split(" ")) + return author_words == 2 or author_words == 3 + +# Capitalize the first letter of each word in the given author name +def capitalize_author(author): + if not author: return None + words = author.split(" ") + words = [w[0].capitalize() + w[1:] for w in words if w] + return " ".join(words) + +# Maintain a mapping of translated author names as a cache +translated_authors = {} + +# Format the given author in a format appropriate for the contributors list. +# If the author is not an actual name, search github and JIRA for potential +# replacements and log all candidates as a warning. 
def translate_author(github_author, github_client, jira_client, warnings):
    """Return a name for ``github_author`` suitable for the contributors list.

    If ``github_author`` already passes ``is_valid_author`` it is returned
    capitalized and no lookup is performed. Otherwise the full names
    registered on github and on JIRA are fetched. Only a valid github name
    is used as an automatic replacement; the JIRA name is merely reported,
    because it can belong to someone else. The outcome of every lookup is
    cached in the module-level ``translated_authors`` dict, keyed by the
    original ``github_author``, so each unknown author is queried and
    reported at most once.

    Args:
        github_author: author string taken from the commit, possibly a
            github username rather than a real name.
        github_client: client handed to ``get_github_name``.
        jira_client: client handed to ``get_jira_name``.
        warnings: list to which one human-readable message is appended for
            each lookup that is actually performed.

    Returns:
        The capitalized real name when one is available, otherwise
        ``github_author`` unchanged.
    """
    if is_valid_author(github_author):
        return capitalize_author(github_author)
    # If the translated author is already cached, just return it
    if github_author in translated_authors:
        return translated_authors[github_author]

    # Otherwise search github and JIRA for an alternative name
    github_name = get_github_name(github_author, github_client)
    jira_name = get_jira_name(github_author, jira_client)
    github_name_is_valid = is_valid_author(github_name)
    if github_name_is_valid:
        github_name = capitalize_author(github_name)
    if is_valid_author(jira_name):
        jira_name = capitalize_author(jira_name)

    # Only use the github name as a replacement automatically
    # The JIRA name may not make sense because it can belong to someone else
    if github_name_is_valid:
        # Mention the JIRA name only when it adds information
        if jira_name and jira_name != github_name:
            candidates_message = " (another candidate is %s)" % jira_name
        else:
            candidates_message = ""
        warnings.append("Replacing github user %s with %s%s"
                        % (github_author, github_name, candidates_message))
        # Key the cache by the *original* author so the lookup above can hit
        translated_authors[github_author] = github_name
        return github_name

    # No direct replacement, so return the original author and list any
    # candidates found. nice_join indexes its argument, so hand it a list
    # (sorted for deterministic output) rather than a set.
    candidates = sorted(set(name for name in (github_name, jira_name) if name))
    candidates_message = " (candidates: %s)" % nice_join(candidates) if candidates else ""
    warnings.append("Unable to find a replacement for github user %s%s"
                    % (github_author, candidates_message))
    translated_authors[github_author] = github_author
    return github_author