path: root/core/src/main
author    Zhan Zhang <zhanzhang@fb.com>    2016-10-15 18:45:04 -0700
committer Mridul Muralidharan <mmuralidharan@HW11853.local>    2016-10-15 18:45:04 -0700
commit    ed1463341455830b8867b721a1b34f291139baf3 (patch)
tree      02752c4bae9e4b1694b96370a8025cf28052832d /core/src/main
parent    36d81c2c68ef4114592b069287743eb5cb078318 (diff)
[SPARK-17637][SCHEDULER] Packed scheduling for Spark tasks across executors
## What changes were proposed in this pull request?

Restructure the code and implement two new task assigners.

PackedAssigner: tries to allocate tasks to the executors with the fewest available cores, so that Spark can release reserved executors when dynamic allocation is enabled.

BalancedAssigner: tries to allocate tasks to the executors with the most available cores, in order to balance the workload across all executors.

By default, the original round-robin assigner is used. In a test pipeline, the new PackedAssigner saved around 45% of reserved CPU and memory with dynamic allocation enabled.

## How was this patch tested?

Unit tests in TaskSchedulerImplSuite and manual tests in a production pipeline.

Author: Zhan Zhang <zhanzhang@fb.com>

Closes #15218 from zhzhan/packed-scheduler.
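The assigner is chosen through the new `spark.task.assigner` configuration read in the TaskSchedulerImpl hunk below. A minimal sketch of opting into packed scheduling (the app name is illustrative; the class names come from this patch):

```scala
import org.apache.spark.SparkConf

// Pack tasks onto the executors with the fewest free cores so that fully idle
// executors can be released when dynamic allocation is enabled.
val conf = new SparkConf()
  .setAppName("packed-scheduling-demo") // illustrative name
  .set("spark.task.assigner", "org.apache.spark.scheduler.PackedAssigner")
// Leaving "spark.task.assigner" unset keeps the default RoundRobinAssigner.
```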
Diffstat (limited to 'core/src/main')
-rw-r--r--  core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala      | 154
-rw-r--r--  core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala |  53
2 files changed, 188 insertions, 19 deletions
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala
new file mode 100644
index 0000000000..62df9657a6
--- /dev/null
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskAssigner.scala
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler
+
+import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.PriorityQueue
+import scala.util.Random
+
+import org.apache.spark.SparkConf
+
+case class OfferState(workOffer: WorkerOffer, var cores: Int) {
+ // Build a list of tasks to assign to each worker.
+ val tasks = new ArrayBuffer[TaskDescription](cores)
+}
+
+abstract class TaskAssigner(conf: SparkConf) {
+ var offer: Seq[OfferState] = _
+ val CPUS_PER_TASK = conf.getInt("spark.task.cpus", 1)
+
+ // The final task assignments returned to the TaskScheduler.
+ def tasks(): Seq[ArrayBuffer[TaskDescription]] = offer.map(_.tasks)
+
+ // Construct the assigner from the given work offers.
+ def construct(workOffer: Seq[WorkerOffer]): Unit = {
+ offer = workOffer.map(o => OfferState(o, o.cores))
+ }
+
+ // Invoked at the start of each round of TaskSet assignment to initialize the internal structure.
+ def init(): Unit
+
+ // Indicates whether an offer is still available for the current round of TaskSet assignment.
+ def hasNext(): Boolean
+
+ // Returns the next available offer for the current round of TaskSet assignment.
+ def getNext(): OfferState
+
+ // Called by the TaskScheduler to report whether the current offer was accepted,
+ // so the assigner can decide whether that offer remains valid for the next request.
+ def taskAssigned(assigned: Boolean): Unit
+
+ // Release internally maintained resources. Subclasses are responsible for
+ // releasing their own private resources.
+ def reset: Unit = {
+ offer = null
+ }
+}
+
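+// Default assigner: shuffles the offers once, then visits them in order,
+// advancing to the next offer after each assignment attempt.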
+class RoundRobinAssigner(conf: SparkConf) extends TaskAssigner(conf) {
+ var i = 0
+ override def construct(workOffer: Seq[WorkerOffer]): Unit = {
+ offer = Random.shuffle(workOffer.map(o => OfferState(o, o.cores)))
+ }
+ override def init(): Unit = {
+ i = 0
+ }
+ override def hasNext: Boolean = {
+ i < offer.size
+ }
+ override def getNext(): OfferState = {
+ offer(i)
+ }
+ override def taskAssigned(assigned: Boolean): Unit = {
+ i += 1
+ }
+ override def reset: Unit = {
+ super.reset
+ i = 0
+ }
+}
+
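+// Prefers the executors with the most free cores: offers sit in a max-heap
+// keyed on free cores, so the workload is spread evenly across executors.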
+class BalancedAssigner(conf: SparkConf) extends TaskAssigner(conf) {
+ var maxHeap: PriorityQueue[OfferState] = _
+ var current: OfferState = _
+
+ override def construct(workOffer: Seq[WorkerOffer]): Unit = {
+ offer = Random.shuffle(workOffer.map(o => OfferState(o, o.cores)))
+ }
+ implicit val ord: Ordering[OfferState] = new Ordering[OfferState] {
+ def compare(x: OfferState, y: OfferState): Int = {
+ return Ordering[Int].compare(x.cores, y.cores)
+ }
+ }
+ def init(): Unit = {
+ maxHeap = new PriorityQueue[OfferState]()
+ offer.filter(_.cores >= CPUS_PER_TASK).foreach(maxHeap.enqueue(_))
+ }
+ override def hasNext: Boolean = {
+ maxHeap.size > 0
+ }
+ override def getNext(): OfferState = {
+ current = maxHeap.dequeue()
+ current
+ }
+
+ override def taskAssigned(assigned: Boolean): Unit = {
+ if (current.cores >= CPUS_PER_TASK && assigned) {
+ maxHeap.enqueue(current)
+ }
+ }
+ override def reset: Unit = {
+ super.reset
+ maxHeap = null
+ current = null
+ }
+}
+
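+// Prefers the executors with the fewest free cores: offers are sorted
+// ascending by free cores and each is filled until it runs out of cores or
+// rejects a task, so idle executors can be released by dynamic allocation.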
+class PackedAssigner(conf: SparkConf) extends TaskAssigner(conf) {
+
+ var sorted: Seq[OfferState] = _
+ var i = 0
+ var current: OfferState = _
+
+ override def init(): Unit = {
+ i = 0
+ sorted = offer.filter(_.cores >= CPUS_PER_TASK).sortBy(_.cores)
+ }
+
+ override def hasNext: Boolean = {
+ i < sorted.size
+ }
+
+ override def getNext(): OfferState = {
+ current = sorted(i)
+ current
+ }
+
+ def taskAssigned(assigned: Boolean): Unit = {
+ if (current.cores < CPUS_PER_TASK || !assigned) {
+ i += 1
+ }
+ }
+
+ override def reset: Unit = {
+ super.reset
+ sorted = null
+ current = null
+ i = 0
+ }
+}
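For orientation, here is a minimal sketch of how a scheduler drives the TaskAssigner protocol defined above; `offerTask` is a hypothetical stand-in for the `TaskSetManager.resourceOffer` call made in the TaskSchedulerImpl hunk below:

```scala
// One round of TaskSet assignment against the offers held by the assigner.
// `offerTask` returns true if it managed to place a task on the given offer
// (the real caller also decrements state.cores).
def assignRound(assigner: TaskAssigner, offerTask: OfferState => Boolean): Unit = {
  assigner.init()                    // rebuild per-round state (heap, cursor, sort)
  while (assigner.hasNext()) {
    val state = assigner.getNext()   // candidate executor with spare cores
    val assigned = offerTask(state)  // try to place a task on this executor
    assigner.taskAssigned(assigned)  // advance, or keep/re-enqueue the offer
  }
}
```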
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
index 3e3f1ad031..fb732ea8e5 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
@@ -22,9 +22,7 @@ import java.util.{Timer, TimerTask}
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicLong
-import scala.collection.Set
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
-import scala.util.Random
import org.apache.spark._
import org.apache.spark.TaskState.TaskState
@@ -61,6 +59,21 @@ private[spark] class TaskSchedulerImpl(
val conf = sc.conf
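+ // Pluggable task assigner, chosen via "spark.task.assigner"; falls back to
+ // RoundRobinAssigner if the configured class cannot be constructed.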
+ val DEFAULT_TASK_ASSIGNER = classOf[RoundRobinAssigner].getName
+ lazy val taskAssigner: TaskAssigner = {
+ val className = conf.get("spark.task.assigner", DEFAULT_TASK_ASSIGNER)
+ try {
+ logInfo(s"""constructing assigner as $className""")
+ val ctor = Utils.classForName(className).getConstructor(classOf[SparkConf])
+ ctor.newInstance(conf).asInstanceOf[TaskAssigner]
+ } catch {
+ case _: Throwable =>
+ logWarning(
+ s"""$className cannot be constructed, falling back to the default
+ | $DEFAULT_TASK_ASSIGNER""".stripMargin)
+ new RoundRobinAssigner(conf)
+ }
+ }
// How often to check for speculative tasks
val SPECULATION_INTERVAL_MS = conf.getTimeAsMs("spark.speculation.interval", "100ms")
@@ -250,24 +263,26 @@ private[spark] class TaskSchedulerImpl(
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
- shuffledOffers: Seq[WorkerOffer],
- availableCpus: Array[Int],
- tasks: IndexedSeq[ArrayBuffer[TaskDescription]]) : Boolean = {
+ taskAssigner: TaskAssigner) : Boolean = {
var launchedTask = false
- for (i <- 0 until shuffledOffers.size) {
- val execId = shuffledOffers(i).executorId
- val host = shuffledOffers(i).host
- if (availableCpus(i) >= CPUS_PER_TASK) {
+ taskAssigner.init()
+ while (taskAssigner.hasNext()) {
+ var assigned = false
+ val current = taskAssigner.getNext()
+ val execId = current.workOffer.executorId
+ val host = current.workOffer.host
+ if (current.cores >= CPUS_PER_TASK) {
try {
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
- tasks(i) += task
+ current.tasks += task
val tid = task.taskId
taskIdToTaskSetManager(tid) = taskSet
taskIdToExecutorId(tid) = execId
executorIdToTaskCount(execId) += 1
- availableCpus(i) -= CPUS_PER_TASK
- assert(availableCpus(i) >= 0)
+ current.cores = current.cores - CPUS_PER_TASK
+ assert(current.cores >= 0)
launchedTask = true
+ assigned = true
}
} catch {
case e: TaskNotSerializableException =>
@@ -277,8 +292,10 @@ private[spark] class TaskSchedulerImpl(
return launchedTask
}
}
+ taskAssigner.taskAssigned(assigned)
}
return launchedTask
+
}
/**
@@ -305,12 +322,8 @@ private[spark] class TaskSchedulerImpl(
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
+ taskAssigner.construct(offers)
- // Randomly shuffle offers to avoid always placing tasks on the same set of workers.
- val shuffledOffers = Random.shuffle(offers)
- // Build a list of tasks to assign to each worker.
- val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
- val availableCpus = shuffledOffers.map(o => o.cores).toArray
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
@@ -329,7 +342,7 @@ private[spark] class TaskSchedulerImpl(
for (currentMaxLocality <- taskSet.myLocalityLevels) {
do {
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(
- taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks)
+ taskSet, currentMaxLocality, taskAssigner)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
}
@@ -337,10 +350,12 @@ private[spark] class TaskSchedulerImpl(
taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
}
}
-
+ val tasks = taskAssigner.tasks
+ taskAssigner.reset
if (tasks.size > 0) {
hasLaunchedTask = true
}
+
return tasks
}