aboutsummaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorMatei Zaharia <matei@eecs.berkeley.edu>2013-12-06 11:03:32 -0800
committerMatei Zaharia <matei@eecs.berkeley.edu>2013-12-06 11:03:32 -0800
commit3fb302c08d078decd1fa7dd0fc008faff132ab7f (patch)
treebac6479a38db5c8b2a0280e5ef623d6ccc33b3b0 /core
parent87676a6af2c8fc33c5b5d4e7eb45e3e8558f3c33 (diff)
parent58b3aff9a871a38446aacc2d60b65199d44e56bb (diff)
downloadspark-3fb302c08d078decd1fa7dd0fc008faff132ab7f.tar.gz
spark-3fb302c08d078decd1fa7dd0fc008faff132ab7f.tar.bz2
spark-3fb302c08d078decd1fa7dd0fc008faff132ab7f.zip
Merge pull request #205 from kayousterhout/logging
Added logging of scheduler delays to UI This commit adds two metrics to the UI: 1) The time to get task results, if they're fetched remotely 2) The scheduler delay. When the scheduler starts getting overwhelmed (because it can't keep up with the rate at which tasks are being submitted), the result is that tasks get delayed on the tail-end: the message from the worker saying that the task has completed ends up in a long queue and takes a while to be processed by the scheduler. This commit records that delay in the UI so that users can tell when the scheduler is becoming the bottleneck.
Diffstat (limited to 'core')
-rw-r--r--core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala36
1 files changed, 34 insertions, 2 deletions
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
index baccc4281a..69f9446bab 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala
@@ -60,11 +60,13 @@ private[spark] class StagePage(parent: JobProgressUI) {
var activeTime = 0L
listener.stageIdToTasksActive(stageId).foreach(activeTime += _.timeRunning(now))
+ val finishedTasks = listener.stageIdToTaskInfos(stageId).filter(_._1.finished)
+
val summary =
<div>
<ul class="unstyled">
<li>
- <strong>CPU time: </strong>
+ <strong>Total duration across all tasks: </strong>
{parent.formatDuration(listener.stageIdToTime.getOrElse(stageId, 0L) + activeTime)}
</li>
{if (hasShuffleRead)
@@ -104,6 +106,33 @@ private[spark] class StagePage(parent: JobProgressUI) {
val serviceQuantiles = "Duration" +: Distribution(serviceTimes).get.getQuantiles().map(
ms => parent.formatDuration(ms.toLong))
+ val gettingResultTimes = validTasks.map{case (info, metrics, exception) =>
+ if (info.gettingResultTime > 0) {
+ (info.finishTime - info.gettingResultTime).toDouble
+ } else {
+ 0.0
+ }
+ }
+ val gettingResultQuantiles = ("Time spent fetching task results" +:
+ Distribution(gettingResultTimes).get.getQuantiles().map(
+ millis => parent.formatDuration(millis.toLong)))
+ // The scheduler delay includes the network delay to send the task to the worker
+ // machine and to send back the result (but not the time to fetch the task result,
+ // if it needed to be fetched from the block manager on the worker).
+ val schedulerDelays = validTasks.map{case (info, metrics, exception) =>
+ val totalExecutionTime = {
+ if (info.gettingResultTime > 0) {
+ (info.gettingResultTime - info.launchTime).toDouble
+ } else {
+ (info.finishTime - info.launchTime).toDouble
+ }
+ }
+ totalExecutionTime - metrics.get.executorRunTime
+ }
+ val schedulerDelayQuantiles = ("Scheduler delay" +:
+ Distribution(schedulerDelays).get.getQuantiles().map(
+ millis => parent.formatDuration(millis.toLong)))
+
def getQuantileCols(data: Seq[Double]) =
Distribution(data).get.getQuantiles().map(d => Utils.bytesToString(d.toLong))
@@ -119,7 +148,10 @@ private[spark] class StagePage(parent: JobProgressUI) {
}
val shuffleWriteQuantiles = "Shuffle Write" +: getQuantileCols(shuffleWriteSizes)
- val listings: Seq[Seq[String]] = Seq(serviceQuantiles,
+ val listings: Seq[Seq[String]] = Seq(
+ serviceQuantiles,
+ gettingResultQuantiles,
+ schedulerDelayQuantiles,
if (hasShuffleRead) shuffleReadQuantiles else Nil,
if (hasShuffleWrite) shuffleWriteQuantiles else Nil)