about | summary | refs | log | tree | commit | diff
path: root/dev/create-release/generate-contributors.py
diff options
context:
space:
mode:
author Andrew Or <andrew@databricks.com> 2014-12-03 19:08:29 -0800
committer Andrew Or <andrew@databricks.com> 2014-12-03 19:19:29 -0800
commit f9e1f89b2500287ff284317fe4504bd32d3b8e1a (patch)
tree f3b5041fcca226b23c880e2660b3c97a6189c20b /dev/create-release/generate-contributors.py
parent 9880bb481943b45cb5ad981809cf5cbd7b0639bb (diff)
download spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.tar.gz
spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.tar.bz2
spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.zip
[Release] Correctly translate contributors name in release notes
This commit involves three main changes:

(1) It separates the translation of contributor names from the generation of the contributors list. This is largely motivated by the Github API limit; even if we exceed this limit, we should at least be able to proceed manually as before. This is why the translation logic is abstracted into its own script translate-contributors.py.

(2) When we look for candidate replacements for invalid author names, we should look for the assignees of the associated JIRAs too. As a result, the intermediate file must keep track of these.

(3) This provides an interactive mode with which the user can sit at the terminal and manually pick the candidate replacement that he/she thinks makes the most sense. As before, there is a non-interactive mode that picks the first candidate that the script considers "valid."

TODO: We should have a known_contributors file that stores known mappings so we don't have to go through all of this translation every time. This is also valuable because some contributors simply cannot be automatically translated.

Conflicts: .gitignore
Diffstat (limited to 'dev/create-release/generate-contributors.py')
-rwxr-xr-x dev/create-release/generate-contributors.py 52
1 files changed, 31 insertions, 21 deletions
diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py
index 99c29ef9ff..a3b78a3eac 100755
--- a/dev/create-release/generate-contributors.py
+++ b/dev/create-release/generate-contributors.py
@@ -26,8 +26,6 @@ from releaseutils import *
# You must set the following before use!
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
-JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
-JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
START_COMMIT = os.environ.get("START_COMMIT", "37b100")
END_COMMIT = os.environ.get("END_COMMIT", "3693ae")
@@ -40,8 +38,6 @@ if not START_COMMIT or not END_COMMIT:
END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ")
# Verify provided arguments
-if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided")
-if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided")
start_commit_line = get_one_line(START_COMMIT)
end_commit_line = get_one_line(END_COMMIT)
num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
@@ -60,14 +56,6 @@ if response.lower() != "y" and response:
sys.exit("Ok, exiting")
print "==================================================================================\n"
-# Setup JIRA and github clients. We use two JIRA clients, one with authentication
-# and one without, because authentication is slow and required only when we query
-# JIRA user details but not Spark issues
-jira_options = { "server": JIRA_API_BASE }
-jira_client = JIRA(options = jira_options)
-jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
-github_client = Github()
-
# Find all commits within this range
print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
commits = get_one_line_commits(START_COMMIT, END_COMMIT)
@@ -105,13 +93,17 @@ if releases or reverts or nojiras:
if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts)
if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
print "==================== Warning: the above commits will be ignored ==================\n"
-response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits))
-if response.lower() != "y":
+response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits))
+if response.lower() != "y" and response:
sys.exit("Ok, exiting.")
# Keep track of warnings to tell the user at the end
warnings = []
+# Mapping from the invalid author name to its associated JIRA issues
+# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
+invalid_authors = {}
+
# Populate a map that groups issues and components by author
# It takes the form: Author name -> { Contribution type -> Spark components }
# For instance,
@@ -127,16 +119,23 @@ warnings = []
# }
#
author_info = {}
+jira_options = { "server": JIRA_API_BASE }
+jira_client = JIRA(options = jira_options)
print "\n=========================== Compiling contributor list ==========================="
for commit in filtered_commits:
commit_hash = re.findall("^[a-z0-9]+", commit)[0]
issues = re.findall("SPARK-[0-9]+", commit.upper())
- # Translate the author in case the github username is not an actual name
- # Also guard against any special characters used in the name
- # Note the JIRA client we use here must have authentication enabled
author = get_author(commit_hash)
- author = unidecode.unidecode(unicode(author, "UTF-8"))
- author = translate_author(author, github_client, jira_client_auth, warnings)
+ author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters
+ # If the author name is invalid, keep track of it along
+ # with all associated issues so we can translate it later
+ if is_valid_author(author):
+ author = capitalize_author(author)
+ else:
+ if author not in invalid_authors:
+ invalid_authors[author] = set()
+ for issue in issues:
+ invalid_authors[author].add(issue)
date = get_date(commit_hash)
# Parse components from the commit message, if any
commit_components = find_components(commit, commit_hash)
@@ -147,7 +146,7 @@ for commit in filtered_commits:
author_info[author] = {}
if issue_type not in author_info[author]:
author_info[author][issue_type] = set()
- for component in all_components:
+ for component in components:
author_info[author][issue_type].add(component)
# Find issues and components associated with this commit
for issue in issues:
@@ -168,7 +167,6 @@ print "=========================================================================
# Each line takes the format "Author name - semi-colon delimited contributions"
# e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core
# e.g. Tathagata Das - Bug fixes and new features in Streaming
-contributors_file_name = "contributors.txt"
contributors_file = open(contributors_file_name, "w")
authors = author_info.keys()
authors.sort()
@@ -192,11 +190,23 @@ for author in authors:
# Do not use python's capitalize() on the whole string to preserve case
assert contribution
contribution = contribution[0].capitalize() + contribution[1:]
+ # If the author name is invalid, use an intermediate format that
+ # can be translated through translate-contributors.py later
+ # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
+ if author in invalid_authors and invalid_authors[author]:
+ author = author + "/" + "/".join(invalid_authors[author])
line = "%s - %s" % (author, contribution)
contributors_file.write(line + "\n")
contributors_file.close()
print "Contributors list is successfully written to %s!" % contributors_file_name
+# Prompt the user to translate author names if necessary
+if invalid_authors:
+ warnings.append("Found the following invalid authors:")
+ for a in invalid_authors:
+ warnings.append("\t%s" % a)
+ warnings.append("Please run './translate-contributors.py' to translate them.")
+
# Log any warnings encountered in the process
if warnings:
print "\n============ Warnings encountered while creating the contributor list ============"