aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJosh Rosen <joshrosen@apache.org>2014-08-11 19:15:01 -0700
committerJosh Rosen <joshrosen@apache.org>2014-08-11 19:15:01 -0700
commit7712e724ad69dd0b83754e938e9799d13a4d43b9 (patch)
tree74f2c4f7f9d31d3f2b7f641c8bb4b5667e10f3d9
parent37338666655909502e424b4639d680271d6d4c12 (diff)
downloadspark-7712e724ad69dd0b83754e938e9799d13a4d43b9.tar.gz
spark-7712e724ad69dd0b83754e938e9799d13a4d43b9.tar.bz2
spark-7712e724ad69dd0b83754e938e9799d13a4d43b9.zip
[SPARK-2931] In TaskSetManager, reset currentLocalityIndex after recomputing locality levels
This addresses SPARK-2931, a bug where getAllowedLocalityLevel() could throw ArrayIndexOutOfBoundsException. The fix here is to reset currentLocalityIndex after recomputing the locality levels. Thanks to kayousterhout, mridulm, and lirui-intel for helping me to debug this. Author: Josh Rosen <joshrosen@apache.org> Closes #1896 from JoshRosen/SPARK-2931 and squashes the following commits: 48b60b5 [Josh Rosen] Move FakeRackUtil.cleanUp() info beforeEach(). 6fec474 [Josh Rosen] Set currentLocalityIndex after recomputing locality levels. 9384897 [Josh Rosen] Update SPARK-2931 test to reflect changes in 63bdb1f41b4895e3a9444f7938094438a94d3007. 9ecd455 [Josh Rosen] Apply @mridulm's patch for reproducing SPARK-2931.
-rw-r--r--core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala11
-rw-r--r--core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala40
2 files changed, 46 insertions, 5 deletions
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
index 20a4bd12f9..d9d53faf84 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala
@@ -690,8 +690,7 @@ private[spark] class TaskSetManager(
handleFailedTask(tid, TaskState.FAILED, ExecutorLostFailure)
}
// recalculate valid locality levels and waits when executor is lost
- myLocalityLevels = computeValidLocalityLevels()
- localityWaits = myLocalityLevels.map(getLocalityWait)
+ recomputeLocality()
}
/**
@@ -775,9 +774,15 @@ private[spark] class TaskSetManager(
levels.toArray
}
- def executorAdded() {
+ def recomputeLocality() {
+ val previousLocalityLevel = myLocalityLevels(currentLocalityIndex)
myLocalityLevels = computeValidLocalityLevels()
localityWaits = myLocalityLevels.map(getLocalityWait)
+ currentLocalityIndex = getLocalityIndex(previousLocalityLevel)
+ }
+
+ def executorAdded() {
+ recomputeLocality()
}
}
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
index ffd23380a8..93e8ddacf8 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala
@@ -154,6 +154,11 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
val LOCALITY_WAIT = conf.getLong("spark.locality.wait", 3000)
val MAX_TASK_FAILURES = 4
+ override def beforeEach() {
+ super.beforeEach()
+ FakeRackUtil.cleanUp()
+ }
+
test("TaskSet with no preferences") {
sc = new SparkContext("local", "test")
val sched = new FakeTaskScheduler(sc, ("exec1", "host1"))
@@ -471,7 +476,6 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
test("new executors get added and lost") {
// Assign host2 to rack2
- FakeRackUtil.cleanUp()
FakeRackUtil.assignHostToRack("host2", "rack2")
sc = new SparkContext("local", "test")
val sched = new FakeTaskScheduler(sc)
@@ -504,7 +508,6 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
}
test("test RACK_LOCAL tasks") {
- FakeRackUtil.cleanUp()
// Assign host1 to rack1
FakeRackUtil.assignHostToRack("host1", "rack1")
// Assign host2 to rack1
@@ -607,6 +610,39 @@ class TaskSetManagerSuite extends FunSuite with LocalSparkContext with Logging {
assert(manager.resourceOffer("execA", "host3", NO_PREF).get.index === 2)
}
+ test("Ensure TaskSetManager is usable after addition of levels") {
+ // Regression test for SPARK-2931
+ sc = new SparkContext("local", "test")
+ val sched = new FakeTaskScheduler(sc)
+ val taskSet = FakeTask.createTaskSet(2,
+ Seq(TaskLocation("host1", "execA")),
+ Seq(TaskLocation("host2", "execB.1")))
+ val clock = new FakeClock
+ val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock)
+ // Only ANY is valid
+ assert(manager.myLocalityLevels.sameElements(Array(ANY)))
+ // Add a new executor
+ sched.addExecutor("execA", "host1")
+ sched.addExecutor("execB.2", "host2")
+ manager.executorAdded()
+ assert(manager.pendingTasksWithNoPrefs.size === 0)
+ // Valid locality should contain PROCESS_LOCAL, NODE_LOCAL and ANY
+ assert(manager.myLocalityLevels.sameElements(Array(PROCESS_LOCAL, NODE_LOCAL, ANY)))
+ assert(manager.resourceOffer("execA", "host1", ANY) !== None)
+ clock.advance(LOCALITY_WAIT * 4)
+ assert(manager.resourceOffer("execB.2", "host2", ANY) !== None)
+ sched.removeExecutor("execA")
+ sched.removeExecutor("execB.2")
+ manager.executorLost("execA", "host1")
+ manager.executorLost("execB.2", "host2")
+ clock.advance(LOCALITY_WAIT * 4)
+ sched.addExecutor("execC", "host3")
+ manager.executorAdded()
+ // Prior to the fix, this line resulted in an ArrayIndexOutOfBoundsException:
+ assert(manager.resourceOffer("execC", "host3", ANY) !== None)
+ }
+
+
def createTaskResult(id: Int): DirectTaskResult[Int] = {
val valueSer = SparkEnv.get.serializer.newInstance()
new DirectTaskResult[Int](valueSer.serialize(id), mutable.Map.empty, new TaskMetrics)