author    Kay Ousterhout <kayousterhout@gmail.com>  2013-11-13 14:38:44 -0800
committer Kay Ousterhout <kayousterhout@gmail.com>  2013-11-13 14:38:44 -0800
commit    150615a31e0c2a5112c37cca62dd80dba8a12fab (patch)
tree      6ed706d3f5a5bc9f55fd12bba7425df950ed11df /ec2/spark_ec2.py
parent    68e5ad58b7e7e3e1b42852de8d0fdf9e9b9c1a14 (diff)
parent    39af914b273e35ff431844951ee8dfadcbc0c400 (diff)
Merge remote-tracking branch 'upstream/master' into consolidate_schedulers
Conflicts:
    core/src/main/scala/org/apache/spark/scheduler/ClusterScheduler.scala
Diffstat (limited to 'ec2/spark_ec2.py')
-rwxr-xr-x  ec2/spark_ec2.py  68
1 file changed, 41 insertions(+), 27 deletions(-)
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 65868b76b9..1189232428 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -72,12 +72,12 @@ def parse_args():
parser.add_option("-a", "--ami", help="Amazon Machine Image ID to use")
parser.add_option("-v", "--spark-version", default="0.8.0",
help="Version of Spark to use: 'X.Y.Z' or a specific git hash")
- parser.add_option("--spark-git-repo",
- default="https://github.com/mesos/spark",
+ parser.add_option("--spark-git-repo",
+ default="https://github.com/apache/incubator-spark",
help="Github repo from which to checkout supplied commit hash")
parser.add_option("--hadoop-major-version", default="1",
help="Major version of Hadoop (default: 1)")
- parser.add_option("-D", metavar="[ADDRESS:]PORT", dest="proxy_port",
+ parser.add_option("-D", metavar="[ADDRESS:]PORT", dest="proxy_port",
help="Use SSH dynamic port forwarding to create a SOCKS proxy at " +
"the given local address (for use with login)")
parser.add_option("--resume", action="store_true", default=False,
@@ -101,6 +101,8 @@ def parse_args():
help="The SSH user you want to connect as (default: root)")
parser.add_option("--delete-groups", action="store_true", default=False,
help="When destroying a cluster, delete the security groups that were created")
+ parser.add_option("--use-existing-master", action="store_true", default=False,
+ help="Launch fresh slaves, but use an existing stopped master if possible")
(opts, args) = parser.parse_args()
if len(args) != 2:
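For context, the new --use-existing-master switch rides on the script's existing optparse setup. A minimal runnable sketch of the pattern; only the option itself comes from this diff, the parser and sample arguments are illustrative:

# Minimal optparse sketch; only --use-existing-master is taken from this diff.
from optparse import OptionParser

parser = OptionParser(usage="spark_ec2 [options] <action> <cluster_name>")
parser.add_option("--use-existing-master", action="store_true", default=False,
    help="Launch fresh slaves, but use an existing stopped master if possible")
(opts, args) = parser.parse_args(["--use-existing-master", "launch", "my-cluster"])
print opts.use_existing_master, args   # True ['launch', 'my-cluster']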
@@ -191,7 +193,7 @@ def get_spark_ami(opts):
instance_type = "pvm"
print >> stderr,\
"Don't recognize %s, assuming type is pvm" % opts.instance_type
-
+
ami_path = "%s/%s/%s" % (AMI_PREFIX, opts.region, instance_type)
try:
    ami = urllib2.urlopen(ami_path).read().strip()
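The surrounding function resolves the AMI id by fetching a plain-text file keyed on region and virtualization type. A hedged sketch of that lookup; the AMI_PREFIX value here is an assumption for illustration, the real constant lives at the top of spark_ec2.py:

# Sketch of the AMI lookup pattern above; AMI_PREFIX is an assumed value.
import urllib2

AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/v2/ami-list"  # assumption
ami_path = "%s/%s/%s" % (AMI_PREFIX, "us-east-1", "pvm")
ami = urllib2.urlopen(ami_path).read().strip()
print "Spark AMI: " + ami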
@@ -215,6 +217,7 @@ def launch_cluster(conn, opts, cluster_name):
master_group.authorize(src_group=slave_group)
master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
+ master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')
master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0')
master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0')
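The added rule opens TCP 19999 on the master's security group, which matches the default web UI port of the Tachyon module deployed later in this commit. A sketch of the boto 2.x call; the region and group name are illustrative assumptions:

# Sketch of the boto 2.x security-group call; region/group name are assumptions.
import boto.ec2

conn = boto.ec2.connect_to_region("us-east-1")
master_group = conn.create_security_group("my-cluster-master", "Spark EC2 master group")
master_group.authorize('tcp', 19999, 19999, '0.0.0.0/0')  # Tachyon web UI port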
@@ -232,9 +235,9 @@ def launch_cluster(conn, opts, cluster_name):
slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
# Check if instances are already running in our groups
- active_nodes = get_existing_cluster(conn, opts, cluster_name,
-     die_on_error=False)
- if any(active_nodes):
+ existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
+     die_on_error=False)
+ if existing_slaves or (existing_masters and not opts.use_existing_master):
    print >> stderr, ("ERROR: There are already instances running in " +
        "group %s or %s" % (master_group.name, slave_group.name))
    sys.exit(1)
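get_existing_cluster now yields a (masters, slaves) pair, so the launch path aborts only when slaves already exist, or when a master exists but reuse was not requested. A standalone sketch of that predicate; the helper name is hypothetical:

# Standalone sketch of the new launch guard; helper name is hypothetical.
def should_abort_launch(existing_masters, existing_slaves, use_existing_master):
    return bool(existing_slaves) or (bool(existing_masters) and not use_existing_master)

assert should_abort_launch([], ["slave-1"], False)      # stray slaves: abort
assert should_abort_launch(["master-1"], [], False)     # master exists, no reuse flag: abort
assert not should_abort_launch(["master-1"], [], True)  # reuse requested: proceed
assert not should_abort_launch([], [], False)           # clean slate: proceed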
@@ -335,21 +338,28 @@ def launch_cluster(conn, opts, cluster_name):
    zone, slave_res.id)
i += 1

- # Launch masters
- master_type = opts.master_instance_type
- if master_type == "":
-     master_type = opts.instance_type
- if opts.zone == 'all':
-     opts.zone = random.choice(conn.get_all_zones()).name
- master_res = image.run(key_name = opts.key_pair,
-     security_groups = [master_group],
-     instance_type = master_type,
-     placement = opts.zone,
-     min_count = 1,
-     max_count = 1,
-     block_device_map = block_map)
- master_nodes = master_res.instances
- print "Launched master in %s, regid = %s" % (zone, master_res.id)
+ # Launch or resume masters
+ if existing_masters:
+     print "Starting master..."
+     for inst in existing_masters:
+         if inst.state not in ["shutting-down", "terminated"]:
+             inst.start()
+     master_nodes = existing_masters
+ else:
+     master_type = opts.master_instance_type
+     if master_type == "":
+         master_type = opts.instance_type
+     if opts.zone == 'all':
+         opts.zone = random.choice(conn.get_all_zones()).name
+     master_res = image.run(key_name = opts.key_pair,
+         security_groups = [master_group],
+         instance_type = master_type,
+         placement = opts.zone,
+         min_count = 1,
+         max_count = 1,
+         block_device_map = block_map)
+     master_nodes = master_res.instances
+     print "Launched master in %s, regid = %s" % (zone, master_res.id)
# Return all the instances
return (master_nodes, slave_nodes)
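In boto 2.x, a stopped EBS-backed instance can be resumed in place with Instance.start(), keeping its instance id, security groups, and EBS root volume; that is what the new branch does instead of always running a fresh instance from the AMI. A condensed sketch of the launch-or-resume flow; the helper name is hypothetical:

# Condensed sketch of the launch-or-resume flow; helper name is hypothetical.
def start_or_launch_master(existing_masters, image, opts, master_group, block_map):
    if existing_masters:
        for inst in existing_masters:
            if inst.state not in ["shutting-down", "terminated"]:
                inst.start()                        # resume the stopped master in place
        return existing_masters
    res = image.run(key_name=opts.key_pair,         # otherwise launch a fresh master
                    security_groups=[master_group],
                    instance_type=opts.master_instance_type or opts.instance_type,
                    placement=opts.zone,
                    min_count=1, max_count=1,
                    block_device_map=block_map)
    return res.instances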
@@ -403,8 +413,8 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
print slave.public_dns_name
ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar)
- modules = ['spark', 'shark', 'ephemeral-hdfs', 'persistent-hdfs',
-     'mapreduce', 'spark-standalone']
+ modules = ['spark', 'shark', 'ephemeral-hdfs', 'persistent-hdfs',
+     'mapreduce', 'spark-standalone', 'tachyon']

if opts.hadoop_major_version == "1":
    modules = filter(lambda x: x != "mapreduce", modules)
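With 'tachyon' appended, the Hadoop-1 filter still removes 'mapreduce' from the deployment list. A quick check of the resulting list (Python 2 semantics, where filter returns a list):

# Quick check of the module filtering above (Python 2: filter returns a list).
modules = ['spark', 'shark', 'ephemeral-hdfs', 'persistent-hdfs',
           'mapreduce', 'spark-standalone', 'tachyon']
hadoop_major_version = "1"   # stand-in for opts.hadoop_major_version
if hadoop_major_version == "1":
    modules = filter(lambda x: x != "mapreduce", modules)
print modules
# ['spark', 'shark', 'ephemeral-hdfs', 'persistent-hdfs', 'spark-standalone', 'tachyon']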
@@ -668,12 +678,12 @@ def real_main():
print "Terminating slaves..."
for inst in slave_nodes:
inst.terminate()
-
+
# Delete security groups as well
if opts.delete_groups:
print "Deleting security groups (this will take some time)..."
group_names = [cluster_name + "-master", cluster_name + "-slaves"]
-
+
attempt = 1;
while attempt <= 3:
print "Attempt %d" % attempt
@@ -731,6 +741,7 @@ def real_main():
cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " +
"BUT THE CLUSTER WILL KEEP USING SPACE ON\n" +
"AMAZON EBS IF IT IS EBS-BACKED!!\n" +
+ "All data on spot-instance slaves will be lost.\n" +
"Stop cluster " + cluster_name + " (y/N): ")
if response == "y":
(master_nodes, slave_nodes) = get_existing_cluster(
@@ -742,7 +753,10 @@ def real_main():
print "Stopping slaves..."
for inst in slave_nodes:
if inst.state not in ["shutting-down", "terminated"]:
- inst.stop()
+ if inst.spot_instance_request_id:
+ inst.terminate()
+ else:
+ inst.stop()
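EC2 spot instances cannot be stopped and later restarted, only terminated, which is why the stop action now branches on spot_instance_request_id (set on boto 2.x instances that were launched from a spot request). A sketch of the per-instance decision; the helper name is hypothetical:

# Sketch of the new stop semantics; helper name is hypothetical.
def stop_slave(inst):
    if inst.state in ["shutting-down", "terminated"]:
        return                   # nothing to do
    if inst.spot_instance_request_id:
        inst.terminate()         # spot instances cannot be stopped, only terminated
    else:
        inst.stop()              # on-demand: stop and keep the EBS root volume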
elif action == "start":
(master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name)