From 14a1df65729a3c7e5fd54f79424626a540aef5d4 Mon Sep 17 00:00:00 2001 From: Grega Kespret Date: Mon, 9 Dec 2013 10:39:02 +0100 Subject: Fix for spark.task.maxFailures not enforced correctly. --- .../org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala index 94961790df..bf494aa64d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManager.scala @@ -529,10 +529,10 @@ private[spark] class ClusterTaskSetManager( addPendingTask(index) if (state != TaskState.KILLED) { numFailures(index) += 1 - if (numFailures(index) > MAX_TASK_FAILURES) { - logError("Task %s:%d failed more than %d times; aborting job".format( + if (numFailures(index) >= MAX_TASK_FAILURES) { + logError("Task %s:%d failed %d times; aborting job".format( taskSet.id, index, MAX_TASK_FAILURES)) - abort("Task %s:%d failed more than %d times".format(taskSet.id, index, MAX_TASK_FAILURES)) + abort("Task %s:%d failed %d times".format(taskSet.id, index, MAX_TASK_FAILURES)) } } } else { -- cgit v1.2.3 From 558af873340087cad79630ec5c498672c5ea3c4f Mon Sep 17 00:00:00 2001 From: Grega Kespret Date: Tue, 10 Dec 2013 11:43:42 +0100 Subject: Fix tests. --- core/src/test/scala/org/apache/spark/DistributedSuite.scala | 2 +- .../org/apache/spark/scheduler/cluster/ClusterTaskSetManagerSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index 480bac84f3..816cbee3e0 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -122,7 +122,7 @@ class DistributedSuite extends FunSuite with ShouldMatchers with BeforeAndAfter sc.parallelize(1 to 10, 10).foreach(x => println(x / 0)) } assert(thrown.getClass === classOf[SparkException]) - assert(thrown.getMessage.contains("more than 4 times")) + assert(thrown.getMessage.contains("failed 4 times")) } test("caching") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManagerSuite.scala index b97f2b19b5..29c4cc5d9c 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/cluster/ClusterTaskSetManagerSuite.scala @@ -283,7 +283,7 @@ class ClusterTaskSetManagerSuite extends FunSuite with LocalSparkContext with Lo // Fail the task MAX_TASK_FAILURES times, and check that the task set is aborted // after the last failure. - (0 until manager.MAX_TASK_FAILURES).foreach { index => + (1 to manager.MAX_TASK_FAILURES).foreach { index => val offerResult = manager.resourceOffer("exec1", "host1", 1, ANY) assert(offerResult != None, "Expect resource offer on iteration %s to return a task".format(index)) -- cgit v1.2.3 From d17c142615f7d0ff514a74779e433107659a78b9 Mon Sep 17 00:00:00 2001 From: Ewen Cheslack-Postava Date: Mon, 16 Dec 2013 08:09:37 -0800 Subject: Force pseudo-tty allocation in spark-ec2 script. ssh commands need the -t argument repeated twice if there is no local tty, e.g. if the process running spark-ec2 uses nohup and the parent process exits. --- ec2/spark_ec2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 1189232428..a2b0e7e7f4 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -589,7 +589,7 @@ def ssh(host, opts, command): while True: try: return subprocess.check_call( - ssh_command(opts) + ['-t', '%s@%s' % (opts.user, host), stringify_command(command)]) + ssh_command(opts) + ['-t', '-t', '%s@%s' % (opts.user, host), stringify_command(command)]) except subprocess.CalledProcessError as e: if (tries > 2): # If this was an ssh failure, provide the user with hints. @@ -730,7 +730,7 @@ def real_main(): if opts.proxy_port != None: proxy_opt = ['-D', opts.proxy_port] subprocess.check_call( - ssh_command(opts) + proxy_opt + ['-t', "%s@%s" % (opts.user, master)]) + ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) elif action == "get-master": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) -- cgit v1.2.3