From 17b72d31e0c59711eddeb525becb8085930eadcc Mon Sep 17 00:00:00 2001 From: Dhruve Ashar Date: Thu, 22 Sep 2016 10:10:37 -0700 Subject: [SPARK-17365][CORE] Remove/Kill multiple executors together to reduce RPC call time. ## What changes were proposed in this pull request? We are killing multiple executors together instead of iterating over expensive RPC calls to kill single executor. ## How was this patch tested? Executed sample spark job to observe executors being killed/removed with dynamic allocation enabled. Author: Dhruve Ashar Author: Dhruve Ashar Closes #15152 from dhruve/impr/SPARK-17365. --- .../spark/ExecutorAllocationManagerSuite.scala | 135 +++++++++++++++++++-- .../deploy/StandaloneDynamicAllocationSuite.scala | 6 +- 2 files changed, 128 insertions(+), 13 deletions(-) (limited to 'core/src/test/scala/org/apache') diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index c130649830..ec409712b9 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -23,7 +23,9 @@ import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ +import org.apache.spark.scheduler.ExternalClusterManager import org.apache.spark.scheduler.cluster.ExecutorInfo +import org.apache.spark.scheduler.local.LocalSchedulerBackend import org.apache.spark.util.ManualClock /** @@ -49,7 +51,7 @@ class ExecutorAllocationManagerSuite test("verify min/max executors") { val conf = new SparkConf() - .setMaster("local") + .setMaster("myDummyLocalExternalClusterManager") .setAppName("test-executor-allocation-manager") .set("spark.dynamicAllocation.enabled", "true") .set("spark.dynamicAllocation.testing", "true") @@ -263,6 +265,55 @@ class ExecutorAllocationManagerSuite assert(executorsPendingToRemove(manager).isEmpty) } + test("remove multiple executors") { + sc = createSparkContext(5, 10, 5) + val manager = sc.executorAllocationManager.get + (1 to 10).map(_.toString).foreach { id => onExecutorAdded(manager, id) } + + // Keep removing until the limit is reached + assert(executorsPendingToRemove(manager).isEmpty) + assert(removeExecutors(manager, Seq("1")) === Seq("1")) + assert(executorsPendingToRemove(manager).size === 1) + assert(executorsPendingToRemove(manager).contains("1")) + assert(removeExecutors(manager, Seq("2", "3")) === Seq("2", "3")) + assert(executorsPendingToRemove(manager).size === 3) + assert(executorsPendingToRemove(manager).contains("2")) + assert(executorsPendingToRemove(manager).contains("3")) + assert(!removeExecutor(manager, "100")) // remove non-existent executors + assert(removeExecutors(manager, Seq("101", "102")) !== Seq("101", "102")) + assert(executorsPendingToRemove(manager).size === 3) + assert(removeExecutor(manager, "4")) + assert(removeExecutors(manager, Seq("5")) === Seq("5")) + assert(!removeExecutor(manager, "6")) // reached the limit of 5 + assert(executorsPendingToRemove(manager).size === 5) + assert(executorsPendingToRemove(manager).contains("4")) + assert(executorsPendingToRemove(manager).contains("5")) + assert(!executorsPendingToRemove(manager).contains("6")) + + // Kill executors previously requested to remove + onExecutorRemoved(manager, "1") + assert(executorsPendingToRemove(manager).size === 4) + assert(!executorsPendingToRemove(manager).contains("1")) + onExecutorRemoved(manager, "2") + onExecutorRemoved(manager, "3") + assert(executorsPendingToRemove(manager).size === 2) + assert(!executorsPendingToRemove(manager).contains("2")) + assert(!executorsPendingToRemove(manager).contains("3")) + onExecutorRemoved(manager, "2") // duplicates should not count + onExecutorRemoved(manager, "3") + assert(executorsPendingToRemove(manager).size === 2) + onExecutorRemoved(manager, "4") + onExecutorRemoved(manager, "5") + assert(executorsPendingToRemove(manager).isEmpty) + + // Try removing again + // This should still fail because the number pending + running is still at the limit + assert(!removeExecutor(manager, "7")) + assert(executorsPendingToRemove(manager).isEmpty) + assert(removeExecutors(manager, Seq("8")) !== Seq("8")) + assert(executorsPendingToRemove(manager).isEmpty) + } + test ("interleaving add and remove") { sc = createSparkContext(5, 10, 5) val manager = sc.executorAllocationManager.get @@ -283,8 +334,7 @@ class ExecutorAllocationManagerSuite // Remove until limit assert(removeExecutor(manager, "1")) - assert(removeExecutor(manager, "2")) - assert(removeExecutor(manager, "3")) + assert(removeExecutors(manager, Seq("2", "3")) === Seq("2", "3")) assert(!removeExecutor(manager, "4")) // lower limit reached assert(!removeExecutor(manager, "5")) onExecutorRemoved(manager, "1") @@ -296,7 +346,7 @@ class ExecutorAllocationManagerSuite assert(addExecutors(manager) === 2) // upper limit reached assert(addExecutors(manager) === 0) assert(!removeExecutor(manager, "4")) // still at lower limit - assert(!removeExecutor(manager, "5")) + assert((manager, Seq("5")) !== Seq("5")) onExecutorAdded(manager, "9") onExecutorAdded(manager, "10") onExecutorAdded(manager, "11") @@ -305,9 +355,7 @@ class ExecutorAllocationManagerSuite assert(executorIds(manager).size === 10) // Remove succeeds again, now that we are no longer at the lower limit - assert(removeExecutor(manager, "4")) - assert(removeExecutor(manager, "5")) - assert(removeExecutor(manager, "6")) + assert(removeExecutors(manager, Seq("4", "5", "6")) === Seq("4", "5", "6")) assert(removeExecutor(manager, "7")) assert(executorIds(manager).size === 10) assert(addExecutors(manager) === 0) @@ -870,8 +918,8 @@ class ExecutorAllocationManagerSuite assert(executorIds(manager) === Set("first", "second", "third", "fourth", "fifth")) removeExecutor(manager, "first") - removeExecutor(manager, "second") - assert(executorsPendingToRemove(manager) === Set("first", "second")) + removeExecutors(manager, Seq("second", "third")) + assert(executorsPendingToRemove(manager) === Set("first", "second", "third")) assert(executorIds(manager) === Set("first", "second", "third", "fourth", "fifth")) @@ -895,7 +943,7 @@ class ExecutorAllocationManagerSuite maxExecutors: Int = 5, initialExecutors: Int = 1): SparkContext = { val conf = new SparkConf() - .setMaster("local") + .setMaster("myDummyLocalExternalClusterManager") .setAppName("test-executor-allocation-manager") .set("spark.dynamicAllocation.enabled", "true") .set("spark.dynamicAllocation.minExecutors", minExecutors.toString) @@ -953,6 +1001,7 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { private val _updateAndSyncNumExecutorsTarget = PrivateMethod[Int]('updateAndSyncNumExecutorsTarget) private val _removeExecutor = PrivateMethod[Boolean]('removeExecutor) + private val _removeExecutors = PrivateMethod[Seq[String]]('removeExecutors) private val _onExecutorAdded = PrivateMethod[Unit]('onExecutorAdded) private val _onExecutorRemoved = PrivateMethod[Unit]('onExecutorRemoved) private val _onSchedulerBacklogged = PrivateMethod[Unit]('onSchedulerBacklogged) @@ -1008,6 +1057,10 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { manager invokePrivate _removeExecutor(id) } + private def removeExecutors(manager: ExecutorAllocationManager, ids: Seq[String]): Seq[String] = { + manager invokePrivate _removeExecutors(ids) + } + private def onExecutorAdded(manager: ExecutorAllocationManager, id: String): Unit = { manager invokePrivate _onExecutorAdded(id) } @@ -1040,3 +1093,65 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { manager invokePrivate _hostToLocalTaskCount() } } + +/** + * A cluster manager which wraps around the scheduler and backend for local mode. It is used for + * testing the dynamic allocation policy. + */ +private class DummyLocalExternalClusterManager extends ExternalClusterManager { + + def canCreate(masterURL: String): Boolean = masterURL == "myDummyLocalExternalClusterManager" + + override def createTaskScheduler( + sc: SparkContext, + masterURL: String): TaskScheduler = new TaskSchedulerImpl(sc, 1, isLocal = true) + + override def createSchedulerBackend( + sc: SparkContext, + masterURL: String, + scheduler: TaskScheduler): SchedulerBackend = { + val sb = new LocalSchedulerBackend(sc.getConf, scheduler.asInstanceOf[TaskSchedulerImpl], 1) + new DummyLocalSchedulerBackend(sc, sb) + } + + override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { + val sc = scheduler.asInstanceOf[TaskSchedulerImpl] + sc.initialize(backend) + } +} + +/** + * A scheduler backend which wraps around local scheduler backend and exposes the executor + * allocation client interface for testing dynamic allocation. + */ +private class DummyLocalSchedulerBackend (sc: SparkContext, sb: SchedulerBackend) + extends SchedulerBackend with ExecutorAllocationClient { + + override private[spark] def getExecutorIds(): Seq[String] = sc.getExecutorIds() + + override private[spark] def requestTotalExecutors( + numExecutors: Int, + localityAwareTasks: Int, + hostToLocalTaskCount: Map[String, Int]): Boolean = + sc.requestTotalExecutors(numExecutors, localityAwareTasks, hostToLocalTaskCount) + + override def requestExecutors(numAdditionalExecutors: Int): Boolean = + sc.requestExecutors(numAdditionalExecutors) + + override def killExecutors(executorIds: Seq[String]): Seq[String] = { + val response = sc.killExecutors(executorIds) + if (response) { + executorIds + } else { + Seq.empty[String] + } + } + + override def start(): Unit = sb.start() + + override def stop(): Unit = sb.stop() + + override def reviveOffers(): Unit = sb.reviveOffers() + + override def defaultParallelism(): Int = sb.defaultParallelism() +} diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index 814027076d..e29eb8552e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -438,12 +438,12 @@ class StandaloneDynamicAllocationSuite val executorIdToTaskCount = taskScheduler invokePrivate getMap() executorIdToTaskCount(executors.head) = 1 // kill the busy executor without force; this should fail - assert(!killExecutor(sc, executors.head, force = false)) + assert(killExecutor(sc, executors.head, force = false).isEmpty) apps = getApplications() assert(apps.head.executors.size === 2) // force kill busy executor - assert(killExecutor(sc, executors.head, force = true)) + assert(killExecutor(sc, executors.head, force = true).nonEmpty) apps = getApplications() // kill executor successfully assert(apps.head.executors.size === 1) @@ -518,7 +518,7 @@ class StandaloneDynamicAllocationSuite } /** Kill the given executor, specifying whether to force kill it. */ - private def killExecutor(sc: SparkContext, executorId: String, force: Boolean): Boolean = { + private def killExecutor(sc: SparkContext, executorId: String, force: Boolean): Seq[String] = { syncExecutors(sc) sc.schedulerBackend match { case b: CoarseGrainedSchedulerBackend => -- cgit v1.2.3