aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Or <andrew@databricks.com>2014-11-26 23:16:23 -0800
committerAndrew Or <andrew@databricks.com>2014-11-26 23:19:10 -0800
commita0aa07baaab10fe6e491a06171fe42e0f102c7a6 (patch)
tree412d707d7f04c29ea03dfe80671a045636e7ca1a
parent66cc2431462a5354bb50c196a59da0ffc258c466 (diff)
downloadspark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.tar.gz
spark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.tar.bz2
spark-a0aa07baaab10fe6e491a06171fe42e0f102c7a6.zip
[Release] Automate generation of contributors list
This commit provides a script that computes the contributors list by linking the github commits with JIRA issues. Automatically translating github usernames remains a TODO at this point.
-rwxr-xr-xdev/create-release/generate-contributors.py206
-rwxr-xr-xdev/create-release/releaseutils.py124
2 files changed, 330 insertions, 0 deletions
diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py
new file mode 100755
index 0000000000..f4bf734081
--- /dev/null
+++ b/dev/create-release/generate-contributors.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This script automates the process of creating the contributors list for the release notes.
+
+import os
+import re
+import sys
+
+from releaseutils import *
+
+# You must set the following before use!
+# Each value is read from the environment variable of the same name and falls
+# back to the hard-coded default shown here when that variable is not set.
+# JIRA_API_BASE is the server URL handed to the JIRA client further down;
+# START_COMMIT and END_COMMIT delimit the commits that are processed.
+# NOTE(review): the prompts below label START_COMMIT as inclusive and END_COMMIT
+# as non-inclusive, but releaseutils passes the range to `git log` as START..END,
+# which in git normally excludes START and includes END -- confirm the intent.
+JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira")
+START_COMMIT = os.environ.get("START_COMMIT", "37b100")
+END_COMMIT = os.environ.get("END_COMMIT", "3693ae")
+
+# Hard requirement: the JIRA client class used to look up issues.
+# If it cannot be imported, print install instructions and exit with status -1.
+try:
+    from jira.client import JIRA
+except ImportError:
+    print "This tool requires the jira-python library"
+    print "Install using 'sudo pip install jira-python'"
+    sys.exit(-1)
+
+# Hard requirement: unidecode, applied to author names further down to guard
+# against special characters. Same failure handling as above.
+try:
+    import unidecode
+except ImportError:
+    print "This tool requires the unidecode library to decode obscure github usernames"
+    print "Install using 'sudo pip install unidecode'"
+    sys.exit(-1)
+
+# If commit range is not specified, prompt the user to provide it
+# Only the missing end(s) of the range are asked for; a value that is already
+# set is kept as-is because `or` short-circuits before calling raw_input.
+if not (START_COMMIT and END_COMMIT):
+    print "A commit range is required to proceed."
+    START_COMMIT = START_COMMIT or \
+        raw_input("Please specify starting commit hash (inclusive): ")
+    END_COMMIT = END_COMMIT or \
+        raw_input("Please specify ending commit hash (non-inclusive): ")
+
+# Verify provided arguments
+# All three git queries run first; their results are then validated in order,
+# exiting with an explanatory message on the first problem found.
+start_commit_line = get_one_line(START_COMMIT)
+end_commit_line = get_one_line(END_COMMIT)
+num_commits = num_commits_in_range(START_COMMIT, END_COMMIT)
+if not start_commit_line:
+    sys.exit("Start commit %s not found!" % START_COMMIT)
+if not end_commit_line:
+    sys.exit("End commit %s not found!" % END_COMMIT)
+if num_commits == 0:
+    sys.exit("There are no commits in the provided range [%s, %s)" % (START_COMMIT, END_COMMIT))
+
+# Show a summary of what is about to be processed and ask for confirmation
+print "\n=================================================================================="
+print "JIRA server: %s" % JIRA_API_BASE
+print "Start commit (inclusive): %s" % start_commit_line
+print "End commit (non-inclusive): %s" % end_commit_line
+print "Number of commits in this range: %s" % num_commits
+print
+response = raw_input("Is this correct? [Y/n] ")
+# An empty answer counts as "yes"; anything other than "y"/"Y" aborts
+if response and response.lower() != "y":
+    sys.exit("Ok, exiting")
+print "==================================================================================\n"
+
+# Find all commits within this range
+# The raw `git log --oneline` output is split into one summary string per line;
+# an empty output aborts the script.
+print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT)
+one_line_log = get_one_line_commits(START_COMMIT, END_COMMIT)
+if not one_line_log:
+    sys.exit("Error: No commits found within this range!")
+commits = one_line_log.split("\n")
+
+# Filter out special commits
+# Every non-empty commit summary is appended to at most one of these lists by
+# the classification loop below.
+releases = []  # summaries matched by is_release; reported and ignored
+reverts = []  # summaries matched by is_revert; reported and ignored
+nojiras = []  # summaries without a SPARK-<number> reference; reported and ignored
+filtered_commits = []  # everything that is actually processed
+# Truthy when the commit summary looks like release bookkeeping: it carries a
+# "[release]" tag (any case), or mentions maven-release-plugin or CHANGES.txt.
+# Returns the non-empty list of tag matches when the tag is present, otherwise
+# a boolean.
+def is_release(commit):
+    release_tags = re.findall("\[release\]", commit.lower())
+    if release_tags:
+        return release_tags
+    if "maven-release-plugin" in commit:
+        return True
+    return "CHANGES.txt" in commit
+# True when the commit summary contains no "SPARK-<digits>" reference
+# (the summary is upper-cased first, so the match ignores case).
+def has_no_jira(commit):
+    return re.search("SPARK-[0-9]+", commit.upper()) is None
+# True when the word "revert" appears anywhere in the commit summary, in any case.
+def is_revert(commit):
+    return commit.lower().find("revert") != -1
+# Truthy when the commit summary looks documentation-related: it contains "doc"
+# (optionally followed by any number of "s") or the phrase "programming guide",
+# in any case. The pattern is an unanchored substring match, so any word that
+# merely contains "doc" also qualifies.
+# Returns the non-empty list of regex matches when there are any, otherwise a
+# boolean for the "programming guide" check.
+def is_docs(commit):
+    lowered = commit.lower()
+    doc_mentions = re.findall("docs*", lowered)
+    if doc_mentions:
+        return doc_mentions
+    return "programming guide" in lowered
+# Sort every non-empty summary into exactly one list; the first rule that
+# matches wins, in the order release, revert, docs, missing JIRA, regular.
+for summary in commits:
+    if not summary:
+        continue
+    if is_release(summary):
+        bucket = releases
+    elif is_revert(summary):
+        bucket = reverts
+    elif is_docs(summary):
+        bucket = filtered_commits # docs may not have JIRA numbers
+    elif has_no_jira(summary):
+        bucket = nojiras
+    else:
+        bucket = filtered_commits
+    bucket.append(summary)
+
+# Warn against ignored commits
+# Print each element of _list on its own line, preceded by a single space.
+def print_indented(_list):
+    for entry in _list:
+        print " %s" % entry
+# Report the ignored commits group by group; empty groups are skipped, and the
+# whole banner is skipped when nothing was ignored.
+ignored_groups = [
+    ("Releases (%d)", releases),
+    ("Reverts (%d)", reverts),
+    ("No JIRA (%d)", nojiras),
+]
+if any(group for _heading, group in ignored_groups):
+    print "\n=================================================================================="
+    for heading, group in ignored_groups:
+        if group:
+            print heading % len(group)
+            print_indented(group)
+    print "==================== Warning: the above commits will be ignored ==================\n"
+# Unlike the earlier prompt, the default here is "no": only "y"/"Y" continues
+response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits))
+if response.lower() != "y":
+    sys.exit("Ok, exiting.")
+
+# Keep track of warnings to tell the user at the end
+# (the translate_* helpers append a message here for each unknown value)
+warnings = []
+
+# Populate a map that groups issues and components by author
+# It takes the form: Author name -> { Contribution type -> set of Spark components }
+# Contribution types and component names are stored in their translated form.
+# For instance,
+# {
+#   'Andrew Or': {
+#     'bug fixes': set(['Windows', 'Core', 'Web UI']),
+#     'improvements': set(['Core'])
+#   },
+#   'Tathagata Das': {
+#     'bug fixes': set(['Streaming']),
+#     'new features': set(['Streaming'])
+#   }
+# }
+#
+author_info = {}
+# Client used to look up each referenced issue on the configured JIRA server
+jira_options = { "server": JIRA_API_BASE }
+jira = JIRA(jira_options)
+print "\n=========================== Compiling contributor list ==========================="
+# For every remaining commit, work out its author, the JIRA issues it references
+# and the affected components, then record them in author_info.
+for commit in filtered_commits:
+    # The one-line summary starts with the abbreviated commit hash
+    commit_hash = re.findall("^[a-z0-9]+", commit)[0]
+    issues = re.findall("SPARK-[0-9]+", commit.upper())
+    author = get_author(commit_hash)
+    author = unidecode.unidecode(unicode(author, "UTF-8")) # guard against special characters
+    date = get_date(commit_hash)
+    # Parse components from the commit message, if any
+    commit_components = find_components(commit, commit_hash)
+    # Populate or merge an issue into author_info[author]
+    # Only the components passed in are recorded. Reading the enclosing loop's
+    # `all_components` here instead would ignore the Core fallback and, for
+    # docs-only commits, use an undefined or stale value from an earlier commit.
+    def populate(issue_type, components):
+        components = components or [CORE_COMPONENT] # assume core if no components provided
+        contributions_by_type = author_info.setdefault(author, {})
+        contributions_by_type.setdefault(issue_type, set()).update(components)
+    # Find issues and components associated with this commit
+    for issue in issues:
+        jira_issue = jira.issue(issue)
+        jira_type = jira_issue.fields.issuetype.name
+        jira_type = translate_issue_type(jira_type, issue, warnings)
+        jira_components = [translate_component(c.name, commit_hash, warnings)
+            for c in jira_issue.fields.components]
+        # Components named in JIRA and in the commit message both count
+        all_components = set(jira_components + commit_components)
+        populate(jira_type, all_components)
+    # For docs without an associated JIRA, manually add it ourselves
+    if is_docs(commit) and not issues:
+        populate("documentation", commit_components)
+    print " Processed commit %s authored by %s on %s" % (commit_hash, author, date)
+print "==================================================================================\n"
+
+# Write to contributors file ordered by author names
+# Each line takes the format "Author name - semi-colon delimited contributions"
+# e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core
+# e.g. Tathagata Das - Bug fixes and new features in Streaming
+# Issue types and components are sorted before being joined so that repeated
+# runs over the same commits produce the same file.
+contributors_file_name = "contributors.txt"
+# The with-block closes the file even if an error is raised part-way through
+with open(contributors_file_name, "w") as contributors_file:
+    for author in sorted(author_info):
+        contributions_by_type = author_info[author]
+        issue_types = sorted(contributions_by_type)
+        components = set()
+        for comps in contributions_by_type.values():
+            components.update(comps)
+        # If there is only one component, mention it only once
+        # e.g. Bug fixes, improvements in MLlib
+        if len(components) == 1:
+            contribution = "%s in %s" % (nice_join(issue_types), next(iter(components)))
+        # Otherwise, group contributions by issue types instead of modules
+        # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN
+        else:
+            contributions = ["%s in %s" % (issue_type, nice_join(sorted(contributions_by_type[issue_type])))
+                for issue_type in issue_types]
+            contribution = "; ".join(contributions)
+        # Do not use python's capitalize() on the whole string to preserve case
+        assert contribution
+        contribution = contribution[0].capitalize() + contribution[1:]
+        contributors_file.write("%s - %s\n" % (author, contribution))
+print "Contributors list is successfully written to %s!" % contributors_file_name
+
+# Log any warnings encountered in the process
+# Nothing is printed when the list is empty; otherwise each warning gets its
+# own line between the two banners.
+if warnings:
+    print "\n============ Warnings encountered while creating the contributor list ============"
+    for message in warnings:
+        print message
+    print "Please correct these in the final contributors list at %s." % contributors_file_name
+    print "==================================================================================\n"
+
diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py
new file mode 100755
index 0000000000..e56d7fa58f
--- /dev/null
+++ b/dev/create-release/releaseutils.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This file contains helper methods used in creating a release.
+
+import re
+from subprocess import Popen, PIPE
+
+# Utility functions run git commands (written with Git 1.8.5)
+# Run cmd (handed to Popen unchanged) and return whatever it wrote to stdout.
+# stderr is not captured and the exit status is not inspected.
+def run_cmd(cmd):
+    process = Popen(cmd, stdout=PIPE)
+    stdout_data = process.communicate()[0]
+    return stdout_data
+# Return the output of `git show` for commit_hash using the %an (author name) format.
+def get_author(commit_hash):
+    git_args = ["git", "show", "--quiet", "--pretty=format:%an", commit_hash]
+    return run_cmd(git_args)
+# Return the output of `git show` for commit_hash using the %cd (committer date) format.
+def get_date(commit_hash):
+    git_args = ["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]
+    return run_cmd(git_args)
+# Return a one-line description of commit_hash: abbreviated hash, committer date
+# and subject. The double quotes are part of the format string itself; since the
+# argument list is passed to Popen without a shell, they presumably show up
+# literally in the returned text.
+def get_one_line(commit_hash):
+    pretty_format = "--pretty=format:\"%h %cd %s\""
+    git_args = ["git", "show", "--quiet", pretty_format, commit_hash]
+    return run_cmd(git_args)
+# Return the raw `git log --oneline` output for the range start_hash..end_hash.
+# NOTE(review): git's A..B range normally excludes A and includes B -- confirm
+# this matches how callers describe the range to the user.
+def get_one_line_commits(start_hash, end_hash):
+    commit_range = "%s..%s" % (start_hash, end_hash)
+    git_args = ["git", "log", "--oneline", commit_range]
+    return run_cmd(git_args)
+# Return the number of commits listed by `git log --oneline` for the range
+# start_hash..end_hash.
+# Delegates to get_one_line_commits so the git invocation that defines the
+# range lives in one place and the count always matches the listing.
+def num_commits_in_range(start_hash, end_hash):
+    output = get_one_line_commits(start_hash, end_hash)
+    lines = [line for line in output.split("\n") if line] # filter out empty lines
+    return len(lines)
+
+# Maintain a mapping for translating issue types to contributions in the release notes
+# This serves an additional function of warning the user against unknown issue types
+# Note: This list is partially derived from this link:
+# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/issuetypes
+# Keep these in lower case
+# (translate_issue_type lower-cases the incoming type before looking it up here;
+# the values are the plural phrases used when writing out contributions)
+known_issue_types = {
+ "bug": "bug fixes",
+ "build": "build fixes",
+ "improvement": "improvements",
+ "new feature": "new features",
+ "documentation": "documentation"
+}
+
+# Maintain a mapping for translating component names when creating the release notes
+# This serves an additional function of warning the user against unknown components
+# Note: This list is largely derived from this link:
+# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/components
+# Keys must be lower case: translate_component lower-cases its input before the
+# lookup. Several keys deliberately collapse onto the same display name.
+# CORE_COMPONENT is the display name for everything folded into Spark core.
+CORE_COMPONENT = "Core"
+known_components = {
+ "block manager": CORE_COMPONENT,
+ "build": CORE_COMPONENT,
+ "deploy": CORE_COMPONENT,
+ "documentation": CORE_COMPONENT,
+ "ec2": "EC2",
+ "examples": CORE_COMPONENT,
+ "graphx": "GraphX",
+ "input/output": CORE_COMPONENT,
+ "java api": "Java API",
+ "mesos": "Mesos",
+ "ml": "MLlib",
+ "mllib": "MLlib",
+ "project infra": "Project Infra",
+ "pyspark": "PySpark",
+ "shuffle": "Shuffle",
+ "spark core": CORE_COMPONENT,
+ "spark shell": CORE_COMPONENT,
+ "sql": "SQL",
+ "streaming": "Streaming",
+ "web ui": "Web UI",
+ "windows": "Windows",
+ "yarn": "YARN"
+}
+
+# Translate issue types using a format appropriate for writing contributions
+# The lookup is case-insensitive. If an unknown issue type is encountered, a
+# message naming issue_id is appended to warnings and the lower-cased type is
+# returned unchanged.
+def translate_issue_type(issue_type, issue_id, warnings):
+    lowered = issue_type.lower()
+    if lowered not in known_issue_types:
+        warnings.append("Unknown issue type \"%s\" (see %s)" % (lowered, issue_id))
+        return lowered
+    return known_issue_types[lowered]
+
+# Translate component names using a format appropriate for writing contributions
+# The lookup is case-insensitive. If an unknown component is encountered, a
+# message naming commit_hash is appended to warnings and the lower-cased name
+# is returned unchanged.
+def translate_component(component, commit_hash, warnings):
+    lowered = component.lower()
+    if lowered not in known_components:
+        warnings.append("Unknown component \"%s\" (see %s)" % (lowered, commit_hash))
+        return lowered
+    return known_components[lowered]
+
+# Parse components in the commit message
+# Components are the bracketed tags of the commit summary, e.g. "[SQL]" or
+# "[Web UI]". The returned components are already filtered and translated:
+# tags that are not keys of known_components (such as "[SPARK-1234]") are
+# dropped silently.
+def find_components(commit, commit_hash):
+    # Capture only the text between the brackets, so that it can match the
+    # bracket-free keys of known_components; allow spaces and punctuation so
+    # multi-word components are found too.
+    tags = [tag.strip() for tag in re.findall(r"\[([^\[\]]*)\]", commit.lower())]
+    # Only known components reach translate_component, so it never has anything
+    # to warn about; the list passed for its warnings argument is a throwaway.
+    components = [translate_component(tag, commit_hash, [])
+        for tag in tags if tag in known_components]
+    return components
+
+# Join a list of strings in a human-readable manner
+# e.g. [] -> ""
+# e.g. ["Juice"] -> "Juice"
+# e.g. ["Juice", "baby"] -> "Juice and baby"
+# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon"
+def nice_join(str_list):
+    items = list(str_list) # sometimes it's a set
+    count = len(items)
+    if count == 0:
+        return ""
+    if count == 1:
+        return items[0]
+    if count == 2:
+        return " and ".join(items)
+    # Three or more: comma-separate all but the last, then add ", and <last>"
+    leading, last = items[:-1], items[-1]
+    return ", ".join(leading) + ", and " + last
+