[Release] Automate generation of contributors list

This commit provides a script that computes the contributors list by linking the github commits with JIRA issues. Automatically translating github usernames remains a TODO at this point.
author: Andrew Or <andrew@databricks.com> 2014-11-26 23:16:23 -0800
committer: Andrew Or <andrew@databricks.com> 2014-11-26 23:19:10 -0800
commit: a0aa07baaab10fe6e491a06171fe42e0f102c7a6 (patch)
tree: 412d707d7f04c29ea03dfe80671a045636e7ca1a
parent: 66cc2431462a5354bb50c196a59da0ffc258c466 (diff)
download: spark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.tar.gz
spark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.tar.bz2
spark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.zip
2 files changed, 330 insertions, 0 deletions
diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py
new file mode 100755
index 0000000000..f4bf734081
--- /dev/null
+++ b/dev/create-release/generate-contributors.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This script automates the process of creating release notes.
+
+import os
+import re
+import sys
+
+from releaseutils import *
+
+# Read the JIRA server and commit range from the environment; override the fallbacks before use!
+JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
+START_COMMIT = os.environ.get("START_COMMIT", "37b100")
+END_COMMIT = os.environ.get("END_COMMIT", "3693ae")
+
+try:
+    from jira.client import JIRA
+except ImportError:
+    print "This tool requires the jira-python library"
+    print "Install using 'sudo pip install jira-python'"
+    sys.exit(-1)
+
+try:
+    import unidecode
+except ImportError:
+    print "This tool requires the unidecode library to decode obscure github usernames"
+    print "Install using 'sudo pip install unidecode'"
+    sys.exit(-1)
+
+# If either end of the commit range is empty, prompt the user to provide it
+if not START_COMMIT or not END_COMMIT:
+    print "A commit range is required to proceed."
+    if not START_COMMIT:
+        START_COMMIT = raw_input("Please specify starting commit hash (inclusive): ")
+    if not END_COMMIT:
+        END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ")
+
+# Verify both commits resolve. NOTE(review): git "A..B" excludes A and includes B, so the inclusive/non-inclusive labels look swapped -- confirm
+start_commit_line = get_one_line(START_COMMIT)
+end_commit_line = get_one_line(END_COMMIT)
+num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
+if not start_commit_line: sys.exit("Start commit %s not found!" % START_COMMIT)
+if not end_commit_line: sys.exit("End commit %s not found!" % END_COMMIT)
+if num_commits == 0:
+    sys.exit("There are no commits in the provided range [%s, %s)" % (START_COMMIT, END_COMMIT))
+print "\n=================================================================================="
+print "JIRA server: %s" % JIRA_API_BASE
+print "Start commit (inclusive): %s" % start_commit_line
+print "End commit (non-inclusive): %s" % end_commit_line
+print "Number of commits in this range: %s" % num_commits
+print
+response = raw_input("Is this correct? [Y/n] ")
+if response.lower() != "y" and response: # an empty answer is accepted as "yes"
+    sys.exit("Ok, exiting")
+print "==================================================================================\n"
+
+# Fetch the one-line log for the range and split it into one entry per commit
+print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
+commits = get_one_line_commits(START_COMMIT, END_COMMIT)
+if not commits: sys.exit("Error: No commits found within this range!")
+commits = commits.split("\n")
+
+# Sort each commit into one bucket (the first matching test wins); only filtered_commits is kept
+releases = []
+reverts = []
+nojiras = []
+filtered_commits = []
+def is_release(commit):
+    return re.findall("\[release\]", commit.lower()) or\
+        "maven-release-plugin" in commit or "CHANGES.txt" in commit
+def has_no_jira(commit):
+    return not re.findall("SPARK-[0-9]+", commit.upper())
+def is_revert(commit):
+    return "revert" in commit.lower()
+def is_docs(commit): # NOTE(review): "docs*" matches any "doc" substring (e.g. "docker") -- confirm
+    return re.findall("docs*", commit.lower()) or "programming guide" in commit.lower()
+for c in commits:
+    if not c: continue
+    elif is_release(c): releases.append(c)
+    elif is_revert(c): reverts.append(c)
+    elif is_docs(c): filtered_commits.append(c) # docs may not have JIRA numbers
+    elif has_no_jira(c): nojiras.append(c)
+    else: filtered_commits.append(c)
+
+# List the release, revert and JIRA-less commits that were set aside, then ask for confirmation
+def print_indented(_list):
+    for x in _list: print "  %s" % x
+if releases or reverts or nojiras:
+    print "\n=================================================================================="
+    if releases: print "Releases (%d)" % len(releases); print_indented(releases)
+    if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts)
+    if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras)
+    print "==================== Warning: the above commits will be ignored ==================\n"
+response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits))
+if response.lower() != "y": # anything other than an explicit "y" aborts
+    sys.exit("Ok, exiting.")
+
+# Keep track of warnings to tell the user at the end
+warnings = []
+
+# Populate a map that groups issues and components by author
+# It takes the form: Author name -> { Contribution type -> Spark components }
+# For instance,
+# {
+#   'Andrew Or': {
+#     'bug fixes': ['windows', 'core', 'web ui'],
+#     'improvements': ['core']
+#   },
+#   'Tathagata Das' : {
+#     'bug fixes': ['streaming']
+#     'new feature': ['streaming']
+#   }
+# }
+#
+author_info = {}
+jira_options = { "server": JIRA_API_BASE }
+jira = JIRA(jira_options)
+print "\n=========================== Compiling contributor list ==========================="
+for commit in filtered_commits:
+    commit_hash = re.findall("^[a-z0-9]+", commit)[0]
+    issues = re.findall("SPARK-[0-9]+", commit.upper())
+    author = get_author(commit_hash)
+    author = unidecode.unidecode(unicode(author, "UTF-8")) # guard against special characters
+    date = get_date(commit_hash)
+    # Parse components from the commit message, if any
+    commit_components = find_components(commit, commit_hash)
+    # Merge the given components into author_info[author][issue_type], creating entries as needed
+    def populate(issue_type, components):
+        components = components or [CORE_COMPONENT] # assume core if no components provided
+        if author not in author_info:
+            author_info[author] = {}
+        if issue_type not in author_info[author]:
+            author_info[author][issue_type] = set()
+        for component in components: # the argument; all_components is unset or stale without a JIRA
+            author_info[author][issue_type].add(component)
+    # Find issues and components associated with this commit
+    for issue in issues:
+        jira_issue = jira.issue(issue)
+        jira_type = jira_issue.fields.issuetype.name
+        jira_type = translate_issue_type(jira_type, issue, warnings)
+        jira_components = [translate_component(c.name, commit_hash, warnings)\
+            for c in jira_issue.fields.components]
+        all_components = set(jira_components + commit_components)
+        populate(jira_type, all_components)
+    # For docs without an associated JIRA, manually add it ourselves
+    if is_docs(commit) and not issues:
+        populate("documentation", commit_components)
+    print "  Processed commit %s authored by %s on %s" % (commit_hash, author, date)
+print "==================================================================================\n"
+
+# Write to contributors file ordered by author names
+# Each line takes the format "Author name - semi-colon delimited contributions"
+# e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core
+# e.g. Tathagata Das - Bug fixes and new features in Streaming
+contributors_file_name = "contributors.txt"
+# Sort the authors so the file is ordered by name
+authors = sorted(author_info.keys())
+# Write inside a context manager so the file is closed even if building a line fails
+with open(contributors_file_name, "w") as contributors_file:
+    for author in authors:
+        contribution = ""
+        components = set()
+        issue_types = set()
+        for issue_type, comps in author_info[author].items():
+            components.update(comps)
+            issue_types.add(issue_type)
+        # If there is only one component, mention it only once
+        # e.g. Bug fixes and improvements in MLlib
+        if len(components) == 1:
+            contribution = "%s in %s" % (nice_join(issue_types), next(iter(components)))
+        # Otherwise, group contributions by issue types instead of modules
+        # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN
+        else:
+            contributions = ["%s in %s" % (issue_type, nice_join(comps)) \
+                for issue_type, comps in author_info[author].items()]
+            contribution = "; ".join(contributions)
+        # Do not use python's capitalize() on the whole string to preserve case
+        assert contribution
+        contribution = contribution[0].capitalize() + contribution[1:]
+        line = "%s - %s" % (author, contribution)
+        contributors_file.write(line + "\n")
+print "Contributors list is successfully written to %s!" % contributors_file_name
+
+# Report every warning collected while compiling the list so it can be corrected by hand
+if len(warnings) > 0:
+    print "\n============ Warnings encountered while creating the contributor list ============"
+    print "\n".join(warnings)
+    print "Please correct these in the final contributors list at %s." % contributors_file_name
+    print "==================================================================================\n"
+
diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
new file mode 100755
index 0000000000..e56d7fa58f
--- /dev/null
+++ b/dev/create-release/releaseutils.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This file contains helper methods used in creating a release.
+
+import re
+from subprocess import Popen, PIPE
+
+# Utility functions run git commands (written with Git 1.8.5)
+def run_cmd(cmd):
+    return Popen(cmd, stdout=PIPE).communicate()[0] # captured stdout only
+def _git_show(pretty_format, commit_hash):
+    return run_cmd(["git", "show", "--quiet", "--pretty=format:" + pretty_format, commit_hash])
+def get_author(commit_hash): return _git_show("%an", commit_hash)
+def get_date(commit_hash): return _git_show("%cd", commit_hash)
+def get_one_line(commit_hash): return _git_show("\"%h %cd %s\"", commit_hash)
+def get_one_line_commits(start_hash, end_hash):
+    return run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)])
+def num_commits_in_range(start_hash, end_hash):
+    # Count the non-empty lines of the one-line log for this range
+    output = get_one_line_commits(start_hash, end_hash)
+    return len([line for line in output.split("\n") if line])
+
+# Maps a JIRA issue type to the wording used for that contribution in the release notes
+# translate_issue_type() also uses it to warn the user about issue types that are not listed
+# Note: This list is partially derived from this link:
+# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/issuetypes
+# Keep the keys in lower case: translate_issue_type() lower-cases its input before the lookup
+known_issue_types = {
+    "bug": "bug fixes",
+    "build": "build fixes",
+    "improvement": "improvements",
+    "new feature": "new features",
+    "documentation": "documentation"
+}
+
+# Maps a lower-cased component name to the name used for it in the release notes
+# translate_component() lower-cases its input before the lookup and warns on names missing here
+# Note: This list is largely derived from this link:
+# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/components
+CORE_COMPONENT = "Core"
+known_components = {
+    "block manager": CORE_COMPONENT,
+    "build": CORE_COMPONENT,
+    "deploy": CORE_COMPONENT,
+    "documentation": CORE_COMPONENT,
+    "ec2": "EC2",
+    "examples": CORE_COMPONENT,
+    "graphx": "GraphX",
+    "input/output": CORE_COMPONENT,
+    "java api": "Java API",
+    "mesos": "Mesos",
+    "ml": "MLlib",
+    "mllib": "MLlib",
+    "project infra": "Project Infra",
+    "pyspark": "PySpark",
+    "shuffle": "Shuffle",
+    "spark core": CORE_COMPONENT,
+    "spark shell": CORE_COMPONENT,
+    "sql": "SQL",
+    "streaming": "Streaming",
+    "web ui": "Web UI",
+    "windows": "Windows",
+    "yarn": "YARN"
+}
+
+# Translate issue types using a format appropriate for writing contributions
+# If an unknown issue type is encountered, warn the user
+def translate_issue_type(issue_type, issue_id, warnings):
+    key = issue_type.lower()
+    try:
+        return known_issue_types[key]
+    except KeyError:
+        warnings.append("Unknown issue type \"%s\" (see %s)" % (key, issue_id))
+        return key # unknown types pass through lower-cased
+
+# Translate component names using a format appropriate for writing contributions
+# If an unknown component is encountered, warn the user
+def translate_component(component, commit_hash, warnings):
+    name = component.lower()
+    if name not in known_components:
+        # Unknown: record a warning and pass the lower-cased name through unchanged
+        warnings.append("Unknown component \"%s\" (see %s)" % (name, commit_hash))
+        return name
+    return known_components[name]
+
+# Parse components in the commit message
+# The returned components are already filtered and translated
+def find_components(commit, commit_hash):
+    tags = [t.strip() for t in re.findall(r"\[([^\[\]]*)\]", commit.lower())] # text inside [...]
+    # Only known tags are kept, so look them up directly: there is nothing to warn about
+    components = [known_components[t] for t in tags if t in known_components]
+    return components
+
+# Join a list of strings in a human-readable manner
+# e.g. ["Juice"] -> "Juice"
+# e.g. ["Juice", "baby"] -> "Juice and baby"
+# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon"
+def nice_join(str_list):
+    items = list(str_list) # callers sometimes pass a set
+    if len(items) < 2:
+        # Zero or one element: there is nothing to join
+        return items[0] if items else ""
+    if len(items) == 2:
+        return " and ".join(items)
+    # Three or more: comma-separate, with ", and " before the last item
+    leading, last = items[:-1], items[-1]
+    return ", ".join(leading) + ", and " + last
+
author	Andrew Or <andrew@databricks.com>	2014-11-26 23:16:23 -0800
committer	Andrew Or <andrew@databricks.com>	2014-11-26 23:19:10 -0800
commit	a0aa07baaab10fe6e491a06171fe42e0f102c7a6 (patch)
tree	412d707d7f04c29ea03dfe80671a045636e7ca1a
parent	66cc2431462a5354bb50c196a59da0ffc258c466 (diff)
download	spark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.tar.gz spark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.tar.bz2 spark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.zip