From f97ce3ae14ed05b3e5d3e6cd137ee5164813634e Mon Sep 17 00:00:00 2001 From: Peter Sankauskas Date: Tue, 11 Dec 2012 10:48:21 -0800 Subject: SPARK-626: Making security group deletion optional, handling retries when deleting security groups fails, fixing bug when using all zones but only 1 slave. --- ec2/spark_ec2.py | 82 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 27 deletions(-) (limited to 'ec2/spark_ec2.py') diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 2e8d2e17f5..2cc8431238 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -30,6 +30,7 @@ import time import urllib2 from optparse import OptionParser from sys import stderr +import boto from boto.ec2.blockdevicemapping import BlockDeviceMapping, EBSBlockDeviceType from boto import ec2 @@ -85,6 +86,8 @@ def parse_args(): help="'mesos' for a mesos cluster, 'standalone' for a standalone spark cluster (default: mesos)") parser.add_option("-u", "--user", default="root", help="The ssh user you want to connect as (default: root)") + parser.add_option("--delete-groups", action="store_true", default=False, + help="When destroying a cluster, also destroy the security groups that were created") (opts, args) = parser.parse_args() if len(args) != 2: @@ -283,16 +286,17 @@ def launch_cluster(conn, opts, cluster_name): slave_nodes = [] for zone in zones: num_slaves_this_zone = get_partition(opts.slaves, num_zones, i) - slave_res = image.run(key_name = opts.key_pair, - security_groups = [slave_group], - instance_type = opts.instance_type, - placement = zone, - min_count = num_slaves_this_zone, - max_count = num_slaves_this_zone, - block_device_map = block_map) - slave_nodes += slave_res.instances - print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, - zone, slave_res.id) + if num_slaves_this_zone > 0: + slave_res = image.run(key_name = opts.key_pair, + security_groups = [slave_group], + instance_type = opts.instance_type, + placement = zone, +
min_count = num_slaves_this_zone, + max_count = num_slaves_this_zone, + block_device_map = block_map) + slave_nodes += slave_res.instances + print "Launched %d slaves in %s, regid = %s" % (num_slaves_this_zone, + zone, slave_res.id) i += 1 # Launch masters @@ -555,24 +559,48 @@ def main(): print "Terminating zoo..." for inst in zoo_nodes: inst.terminate() + # Delete security groups as well - group_names = [cluster_name + "-master", cluster_name + "-slaves", cluster_name + "-zoo"] - groups = [g for g in conn.get_all_security_groups() if g.name in group_names] - # Delete individual rules in all groups before deleting groups to remove - # dependencies between them - for group in groups: - print "Deleting rules in security group " + group.name - for rule in group.rules: - for grant in rule.grants: - group.revoke(ip_protocol=rule.ip_protocol, - from_port=rule.from_port, - to_port=rule.to_port, - src_group=grant) - # Sleep for AWS eventual-consistency to catch up - time.sleep(30) # Yes, it does have to be this long :-( - for group in groups: - print "Deleting security group " + group.name - conn.delete_security_group(group.name) + if opts.delete_groups: + print "Deleting security groups (this will take some time)..." 
+ group_names = [cluster_name + "-master", cluster_name + "-slaves", cluster_name + "-zoo"] + + attempt = 1; + while attempt <= 3: + print "Attempt %d" % attempt + groups = [g for g in conn.get_all_security_groups() if g.name in group_names] + success = True + # Delete individual rules in all groups before deleting groups to + # remove dependencies between them + for group in groups: + print "Deleting rules in security group " + group.name + for rule in group.rules: + for grant in rule.grants: + success &= group.revoke(ip_protocol=rule.ip_protocol, + from_port=rule.from_port, + to_port=rule.to_port, + src_group=grant) + + # Sleep for AWS eventual-consistency to catch up, and for instances + # to terminate + time.sleep(30) # Yes, it does have to be this long :-( + for group in groups: + try: + conn.delete_security_group(group.name) + print "Deleted security group " + group.name + except boto.exception.EC2ResponseError: + success = False; + print "Failed to delete security group " + group.name + + # Unfortunately, group.revoke() returns True even if a rule was not + # deleted, so this needs to be rerun if something fails + if success: break; + + attempt += 1 + + if not success: + print "Failed to delete all security groups after 3 tries." + print "Try re-running in a few minutes." elif action == "login": (master_nodes, slave_nodes, zoo_nodes) = get_existing_cluster( -- cgit v1.2.3