aboutsummaryrefslogtreecommitdiff
path: root/ec2
diff options
context:
space:
mode:
author Florian Verhein <florian.verhein@gmail.com> 2015-02-09 23:47:07 +0000
committer Sean Owen <sowen@cloudera.com> 2015-02-09 23:47:07 +0000
commit b884daa58084d4f42e2318894067565b94e07f9d (patch)
tree f047764ae63bdb78f45baec499f0cd671b6cb819 /ec2
parent f48199eb354d6ec8675c2c1f96c3005064058d66 (diff)
download spark-b884daa58084d4f42e2318894067565b94e07f9d.tar.gz
spark-b884daa58084d4f42e2318894067565b94e07f9d.tar.bz2
spark-b884daa58084d4f42e2318894067565b94e07f9d.zip
[SPARK-5611] [EC2] Allow spark-ec2 repo and branch to be set on CLI of spark_ec2.py
and by extension, the ami-list

Useful for using alternate spark-ec2 repos or branches.

Author: Florian Verhein <florian.verhein@gmail.com>

Closes #4385 from florianverhein/master and squashes the following commits:

7e2b4be [Florian Verhein] [SPARK-5611] [EC2] typo
8b653dc [Florian Verhein] [SPARK-5611] [EC2] Enforce only supporting spark-ec2 forks from github, log improvement
bc4b0ed [Florian Verhein] [SPARK-5611] allow spark-ec2 repos with different names
8b5c551 [Florian Verhein] improve option naming, fix logging, fix lint failing, add guard to enforce spark-ec2
7724308 [Florian Verhein] [SPARK-5611] [EC2] fixes
b42b68c [Florian Verhein] [SPARK-5611] [EC2] Allow spark-ec2 repo and branch to be set on CLI of spark_ec2.py
Diffstat (limited to 'ec2')
-rwxr-xr-x ec2/spark_ec2.py 37
1 files changed, 32 insertions, 5 deletions
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 87b2112fe4..3e4c49c0e1 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -62,10 +62,10 @@ VALID_SPARK_VERSIONS = set([
DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION
DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark"
-MESOS_SPARK_EC2_BRANCH = "branch-1.3"
-# A URL prefix from which to fetch AMI information
-AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/{b}/ami-list".format(b=MESOS_SPARK_EC2_BRANCH)
+# Default location to get the spark-ec2 scripts (and ami-list) from
+DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/mesos/spark-ec2"
+DEFAULT_SPARK_EC2_BRANCH = "branch-1.3"
def setup_boto():
@@ -148,6 +148,14 @@ def parse_args():
default=DEFAULT_SPARK_GITHUB_REPO,
help="Github repo from which to checkout supplied commit hash (default: %default)")
parser.add_option(
+ "--spark-ec2-git-repo",
+ default=DEFAULT_SPARK_EC2_GITHUB_REPO,
+ help="Github repo from which to checkout spark-ec2 (default: %default)")
+ parser.add_option(
+ "--spark-ec2-git-branch",
+ default=DEFAULT_SPARK_EC2_BRANCH,
+ help="Github repo branch of spark-ec2 to use (default: %default)")
+ parser.add_option(
"--hadoop-major-version", default="1",
help="Major version of Hadoop (default: %default)")
parser.add_option(
@@ -333,7 +341,12 @@ def get_spark_ami(opts):
print >> stderr,\
"Don't recognize %s, assuming type is pvm" % opts.instance_type
- ami_path = "%s/%s/%s" % (AMI_PREFIX, opts.region, instance_type)
+ # URL prefix from which to fetch AMI information
+ ami_prefix = "{r}/{b}/ami-list".format(
+ r=opts.spark_ec2_git_repo.replace("https://github.com", "https://raw.github.com", 1),
+ b=opts.spark_ec2_git_branch)
+
+ ami_path = "%s/%s/%s" % (ami_prefix, opts.region, instance_type)
try:
ami = urllib2.urlopen(ami_path).read().strip()
print "Spark AMI: " + ami
@@ -650,12 +663,15 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
# NOTE: We should clone the repository before running deploy_files to
# prevent ec2-variables.sh from being overwritten
+ print "Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format(
+ r=opts.spark_ec2_git_repo, b=opts.spark_ec2_git_branch)
ssh(
host=master,
opts=opts,
command="rm -rf spark-ec2"
+ " && "
- + "git clone https://github.com/mesos/spark-ec2.git -b {b}".format(b=MESOS_SPARK_EC2_BRANCH)
+ + "git clone {r} -b {b} spark-ec2".format(r=opts.spark_ec2_git_repo,
+ b=opts.spark_ec2_git_branch)
)
print "Deploying files to master..."
@@ -1038,6 +1054,17 @@ def real_main():
print >> stderr, "ebs-vol-num cannot be greater than 8"
sys.exit(1)
+ # Prevent breaking ami_prefix (/, .git and startswith checks)
+ # Prevent forks with non spark-ec2 names for now.
+ if opts.spark_ec2_git_repo.endswith("/") or \
+ opts.spark_ec2_git_repo.endswith(".git") or \
+ not opts.spark_ec2_git_repo.startswith("https://github.com") or \
+ not opts.spark_ec2_git_repo.endswith("spark-ec2"):
+ print >> stderr, "spark-ec2-git-repo must be a github repo and it must not have a " \
+ "trailing / or .git. " \
+ "Furthermore, we currently only support forks named spark-ec2."
+ sys.exit(1)
+
try:
conn = ec2.connect_to_region(opts.region)
except Exception as e: