[Release] Correctly translate contributors name in release notes

This commit involves three main changes: (1) It separates the translation of contributor names from the generation of the contributors list. This is largely motivated by the Github API limit; even if we exceed this limit, we should at least be able to proceed manually as before. This is why the translation logic is abstracted into its own script translate-contributors.py. (2) When we look for candidate replacements for invalid author names, we should look for the assignees of the associated JIRAs too. As a result, the intermediate file must keep track of these. (3) This provides an interactive mode with which the user can sit at the terminal and manually pick the candidate replacement that he/she thinks makes the most sense. As before, there is a non-interactive mode that picks the first candidate that the script considers "valid." TODO: We should have a known_contributors file that stores known mappings so we don't have to go through all of this translation every time. This is also valuable because some contributors simply cannot be automatically translated. Conflicts: .gitignore
author: Andrew Or <andrew@databricks.com> 2014-12-03 19:08:29 -0800
committer: Andrew Or <andrew@databricks.com> 2014-12-03 19:19:29 -0800
commit: f9e1f89b2500287ff284317fe4504bd32d3b8e1a (patch)
tree: f3b5041fcca226b23c880e2660b3c97a6189c20b /dev/create-release/generate-contributors.py
parent: 9880bb481943b45cb5ad981809cf5cbd7b0639bb (diff)
download: spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.tar.gz
spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.tar.bz2
spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.zip
1 files changed, 31 insertions, 21 deletions
diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py
index 99c29ef9ff..a3b78a3eac 100755
--- a/dev/create-release/generate-contributors.py
+++ b/dev/create-release/generate-contributors.py
@@ -26,8 +26,6 @@ from releaseutils import *
 
 # You must set the following before use!
 JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
-JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None)
-JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None)
 START_COMMIT = os.environ.get("START_COMMIT", "37b100")
 END_COMMIT = os.environ.get("END_COMMIT", "3693ae")
 
@@ -40,8 +38,6 @@ if not START_COMMIT or not END_COMMIT:
         END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ")
 
 # Verify provided arguments
-if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided")
-if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided")
 start_commit_line = get_one_line(START_COMMIT)
 end_commit_line = get_one_line(END_COMMIT)
 num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
@@ -60,14 +56,6 @@ if response.lower() != "y" and response:
     sys.exit("Ok, exiting")
 print "==================================================================================\n"
 
-# Setup JIRA and github clients. We use two JIRA clients, one with authentication
-# and one without, because authentication is slow and required only when we query
-# JIRA user details but not Spark issues
-jira_options = { "server": JIRA_API_BASE }
-jira_client = JIRA(options = jira_options)
-jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD))
-github_client = Github()
-
 # Find all commits within this range
 print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
 commits = get_one_line_commits(START_COMMIT, END_COMMIT)
@@ -105,13 +93,17 @@ if releases or reverts or nojiras:
     if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts)
     if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
     print "==================== Warning: the above commits will be ignored ==================\n"
-response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits))
-if response.lower() != "y":
+response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits))
+if response.lower() != "y" and response:
     sys.exit("Ok, exiting.")
 
 # Keep track of warnings to tell the user at the end
 warnings = []
 
+# Mapping from the invalid author name to its associated JIRA issues
+# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471")
+invalid_authors = {}
+
 # Populate a map that groups issues and components by author
 # It takes the form: Author name -> { Contribution type -> Spark components }
 # For instance,
@@ -127,16 +119,23 @@ warnings = []
 # }
 #
 author_info = {}
+jira_options = { "server": JIRA_API_BASE }
+jira_client = JIRA(options = jira_options)
 print "\n=========================== Compiling contributor list ==========================="
 for commit in filtered_commits:
     commit_hash = re.findall("^[a-z0-9]+", commit)[0]
     issues = re.findall("SPARK-[0-9]+", commit.upper())
-    # Translate the author in case the github username is not an actual name
-    # Also guard against any special characters used in the name
-    # Note the JIRA client we use here must have authentication enabled
     author = get_author(commit_hash)
-    author = unidecode.unidecode(unicode(author, "UTF-8"))
-    author = translate_author(author, github_client, jira_client_auth, warnings)
+    author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters
+    # If the author name is invalid, keep track of it along
+    # with all associated issues so we can translate it later
+    if is_valid_author(author):
+        author = capitalize_author(author)
+    else:
+        if author not in invalid_authors:
+            invalid_authors[author] = set()
+        for issue in issues:
+            invalid_authors[author].add(issue)
     date = get_date(commit_hash)
     # Parse components from the commit message, if any
     commit_components = find_components(commit, commit_hash)
@@ -147,7 +146,7 @@ for commit in filtered_commits:
             author_info[author] = {}
         if issue_type not in author_info[author]:
             author_info[author][issue_type] = set()
-        for component in all_components:
+        for component in components:
             author_info[author][issue_type].add(component)
     # Find issues and components associated with this commit
     for issue in issues:
@@ -168,7 +167,6 @@ print "=========================================================================
 # Each line takes the format "Author name - semi-colon delimited contributions"
 # e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core
 # e.g. Tathagata Das - Bug fixes and new features in Streaming
-contributors_file_name = "contributors.txt"
 contributors_file = open(contributors_file_name, "w")
 authors = author_info.keys()
 authors.sort()
@@ -192,11 +190,23 @@ for author in authors:
     # Do not use python's capitalize() on the whole string to preserve case
     assert contribution
     contribution = contribution[0].capitalize() + contribution[1:]
+    # If the author name is invalid, use an intermediate format that
+    # can be translated through translate-contributors.py later
+    # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672
+    if author in invalid_authors and invalid_authors[author]:
+        author = author + "/" + "/".join(invalid_authors[author])
     line = "%s - %s" % (author, contribution)
     contributors_file.write(line + "\n")
 contributors_file.close()
 print "Contributors list is successfully written to %s!" % contributors_file_name
 
+# Prompt the user to translate author names if necessary
+if invalid_authors:
+    warnings.append("Found the following invalid authors:")
+    for a in invalid_authors:
+        warnings.append("\t%s" % a)
+    warnings.append("Please run './translate-contributors.py' to translate them.")
+
 # Log any warnings encountered in the process
 if warnings:
     print "\n============ Warnings encountered while creating the contributor list ============"
author	Andrew Or <andrew@databricks.com>	2014-12-03 19:08:29 -0800
committer	Andrew Or <andrew@databricks.com>	2014-12-03 19:19:29 -0800
commit	f9e1f89b2500287ff284317fe4504bd32d3b8e1a (patch)
tree	f3b5041fcca226b23c880e2660b3c97a6189c20b /dev/create-release/generate-contributors.py
parent	9880bb481943b45cb5ad981809cf5cbd7b0639bb (diff)
download	spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.tar.gz spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.tar.bz2 spark-f9e1f89b2500287ff284317fe4504bd32d3b8e1a.zip