about | summary | refs | log | tree | commit | diff
path: root/dev/create-release/releaseutils.py
diff options
context:
space:
mode:
author: Andrew Or <andrew@databricks.com> 2014-12-16 17:55:27 -0800
committer: Andrew Or <andrew@databricks.com> 2014-12-16 18:05:46 -0800
commit: 0fb00473904ff3643b6f6848e0faa0deeb1d60f5 (patch)
tree: d98a1db6b868cfdc869831fc44771a3008960a78 /dev/create-release/releaseutils.py
parent: 1b6fc237c26d9fcb9d4afc9c93a21f9134231145 (diff)
download: spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.tar.gz
spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.tar.bz2
spark-0fb00473904ff3643b6f6848e0faa0deeb1d60f5.zip
[Release] Major improvements to generate contributors script
This commit introduces several major improvements to the script that generates the contributors list for release notes, notably: (1) Use release tags instead of a range of commits. Across branches, the commit history is not strictly linear, and so it is not sufficient to specify a start hash and an end hash. Otherwise, we end up counting commits that were already merged in an older branch. (2) Match PR numbers in addition to commit hashes. This is related to the first point in that if a PR is already merged in an older minor release tag, it should be filtered out here. This requires us to do some intelligent regex parsing on the commit description in addition to just relying on the GitHub API. (3) Relax author validity check. The old code fails on a name that has many middle names, for instance. The test was just too strict. (4) Use GitHub authentication. This allows us to make far more requests through the GitHub API than before (5000 as opposed to 60 per hour). (5) Translate from GitHub username, not commit author name. This is important because the commit author name is not always configured correctly by the user. For instance, the username "falaki" used to resolve to just "Hossein", which was treated as a GitHub username and translated to something else that is completely arbitrary. (6) Add an option to use the untranslated name. If there is not a satisfactory candidate to replace the untranslated name with, at least allow the user to not translate it.
Diffstat (limited to 'dev/create-release/releaseutils.py')
-rwxr-xr-x dev/create-release/releaseutils.py 94
1 files changed, 81 insertions, 13 deletions
diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
index 76a10c3288..18e16bcb90 100755
--- a/dev/create-release/releaseutils.py
+++ b/dev/create-release/releaseutils.py
@@ -19,6 +19,7 @@
# This file contains helper methods used in creating a release.
import re
+import sys
from subprocess import Popen, PIPE
try:
@@ -47,20 +48,85 @@ except ImportError:
# Contributors list file name
contributors_file_name = "contributors.txt"
+# Prompt the user to answer yes or no until they do so
+def yesOrNoPrompt(msg):
+ response = raw_input("%s [y/n]: " % msg)
+ while response != "y" and response != "n":
+ return yesOrNoPrompt(msg)
+ return response == "y"
+
# Utility functions run git commands (written with Git 1.8.5)
def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0]
-def get_author(commit_hash):
- return run_cmd(["git", "show", "--quiet", "--pretty=format:%an", commit_hash])
+def run_cmd_error(cmd): return Popen(cmd, stderr=PIPE).communicate()[1]
def get_date(commit_hash):
return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash])
-def get_one_line(commit_hash):
- return run_cmd(["git", "show", "--quiet", "--pretty=format:\"%h %cd %s\"", commit_hash])
-def get_one_line_commits(start_hash, end_hash):
- return run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)])
-def num_commits_in_range(start_hash, end_hash):
- output = run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)])
- lines = [line for line in output.split("\n") if line] # filter out empty lines
- return len(lines)
+def tag_exists(tag):
+ stderr = run_cmd_error(["git", "checkout", tag])
+ return "error" not in stderr
+
+# A type-safe representation of a commit
+class Commit:
+ def __init__(self, _hash, author, title, pr_number = None):
+ self._hash = _hash
+ self.author = author
+ self.title = title
+ self.pr_number = pr_number
+ def get_hash(self): return self._hash
+ def get_author(self): return self.author
+ def get_title(self): return self.title
+ def get_pr_number(self): return self.pr_number
+ def __str__(self):
+ closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else ""
+ return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr)
+
+# Return all commits that belong to the specified tag.
+#
+# Under the hood, this runs a `git log` on that tag and parses the fields
+# from the command output to construct a list of Commit objects. Note that
+# because certain fields reside in the commit description and cannot be parsed
+# through the Github API itself, we need to do some intelligent regex parsing
+# to extract those fields.
+#
+# This is written using Git 1.8.5.
+def get_commits(tag):
+ commit_start_marker = "|=== COMMIT START MARKER ===|"
+ commit_end_marker = "|=== COMMIT END MARKER ===|"
+ field_end_marker = "|=== COMMIT FIELD END MARKER ===|"
+ log_format =\
+ commit_start_marker + "%h" +\
+ field_end_marker + "%an" +\
+ field_end_marker + "%s" +\
+ commit_end_marker + "%b"
+ output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag])
+ commits = []
+ raw_commits = [c for c in output.split(commit_start_marker) if c]
+ for commit in raw_commits:
+ if commit.count(commit_end_marker) != 1:
+ print "Commit end marker not found in commit: "
+ for line in commit.split("\n"): print line
+ sys.exit(1)
+ # Separate commit digest from the body
+ # From the digest we extract the hash, author and the title
+ # From the body, we extract the PR number and the github username
+ [commit_digest, commit_body] = commit.split(commit_end_marker)
+ if commit_digest.count(field_end_marker) != 2:
+ sys.exit("Unexpected format in commit: %s" % commit_digest)
+ [_hash, author, title] = commit_digest.split(field_end_marker)
+ # The PR number and github username is in the commit message
+ # itself and cannot be accessed through any Github API
+ pr_number = None
+ match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body)
+ if match:
+ [pr_number, github_username] = match.groups()
+ # If the author name is not valid, use the github
+ # username so we can translate it properly later
+ if not is_valid_author(author):
+ author = github_username
+ # Guard against special characters
+ author = unidecode.unidecode(unicode(author, "UTF-8")).strip()
+ commit = Commit(_hash, author, title, pr_number)
+ commits.append(commit)
+ return commits
# Maintain a mapping for translating issue types to contributions in the release notes
# This serves an additional function of warning the user against unknown issue types
@@ -70,10 +136,13 @@ def num_commits_in_range(start_hash, end_hash):
known_issue_types = {
"bug": "bug fixes",
"build": "build fixes",
+ "dependency upgrade": "build fixes",
"improvement": "improvements",
"new feature": "new features",
"documentation": "documentation",
- "test": "test"
+ "test": "test",
+ "task": "improvement",
+ "sub-task": "improvement"
}
# Maintain a mapping for translating component names when creating the release notes
@@ -176,8 +245,7 @@ def get_jira_name(author, jira_client):
# Return whether the given name is in the form <First Name><space><Last Name>
def is_valid_author(author):
if not author: return False
- author_words = len(author.split(" "))
- return author_words == 2 or author_words == 3
+ return " " in author and not re.findall("[0-9]", author)
# Capitalize the first letter of each word in the given author name
def capitalize_author(author):