diff options
author | Matei Zaharia <matei@eecs.berkeley.edu> | 2013-12-06 11:03:32 -0800 |
---|---|---|
committer | Matei Zaharia <matei@eecs.berkeley.edu> | 2013-12-06 11:03:32 -0800 |
commit | 3fb302c08d078decd1fa7dd0fc008faff132ab7f (patch) | |
tree | bac6479a38db5c8b2a0280e5ef623d6ccc33b3b0 /core | |
parent | 87676a6af2c8fc33c5b5d4e7eb45e3e8558f3c33 (diff) | |
parent | 58b3aff9a871a38446aacc2d60b65199d44e56bb (diff) | |
download | spark-3fb302c08d078decd1fa7dd0fc008faff132ab7f.tar.gz spark-3fb302c08d078decd1fa7dd0fc008faff132ab7f.tar.bz2 spark-3fb302c08d078decd1fa7dd0fc008faff132ab7f.zip |
Merge pull request #205 from kayousterhout/logging
Added logging of scheduler delays to UI
This commit adds two metrics to the UI:
1) The time to get task results, if they're fetched remotely
2) The scheduler delay. When the scheduler starts getting overwhelmed (because it can't keep up with the rate at which tasks are being submitted), the result is that tasks get delayed on the tail-end: the message from the worker saying that the task has completed ends up in a long queue and takes a while to be processed by the scheduler. This commit records that delay in the UI so that users can tell when the scheduler is becoming the bottleneck.
Diffstat (limited to 'core')
-rw-r--r-- | core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 36 |
1 file changed, 34 insertions, 2 deletions
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index baccc4281a..69f9446bab 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -60,11 +60,13 @@ private[spark] class StagePage(parent: JobProgressUI) { var activeTime = 0L listener.stageIdToTasksActive(stageId).foreach(activeTime += _.timeRunning(now)) + val finishedTasks = listener.stageIdToTaskInfos(stageId).filter(_._1.finished) + val summary = <div> <ul class="unstyled"> <li> - <strong>CPU time: </strong> + <strong>Total duration across all tasks: </strong> {parent.formatDuration(listener.stageIdToTime.getOrElse(stageId, 0L) + activeTime)} </li> {if (hasShuffleRead) @@ -104,6 +106,33 @@ private[spark] class StagePage(parent: JobProgressUI) { val serviceQuantiles = "Duration" +: Distribution(serviceTimes).get.getQuantiles().map( ms => parent.formatDuration(ms.toLong)) + val gettingResultTimes = validTasks.map{case (info, metrics, exception) => + if (info.gettingResultTime > 0) { + (info.finishTime - info.gettingResultTime).toDouble + } else { + 0.0 + } + } + val gettingResultQuantiles = ("Time spent fetching task results" +: + Distribution(gettingResultTimes).get.getQuantiles().map( + millis => parent.formatDuration(millis.toLong))) + // The scheduler delay includes the network delay to send the task to the worker + // machine and to send back the result (but not the time to fetch the task result, + // if it needed to be fetched from the block manager on the worker). 
+ val schedulerDelays = validTasks.map{case (info, metrics, exception) => + val totalExecutionTime = { + if (info.gettingResultTime > 0) { + (info.gettingResultTime - info.launchTime).toDouble + } else { + (info.finishTime - info.launchTime).toDouble + } + } + totalExecutionTime - metrics.get.executorRunTime + } + val schedulerDelayQuantiles = ("Scheduler delay" +: + Distribution(schedulerDelays).get.getQuantiles().map( + millis => parent.formatDuration(millis.toLong))) + def getQuantileCols(data: Seq[Double]) = Distribution(data).get.getQuantiles().map(d => Utils.bytesToString(d.toLong)) @@ -119,7 +148,10 @@ private[spark] class StagePage(parent: JobProgressUI) { } val shuffleWriteQuantiles = "Shuffle Write" +: getQuantileCols(shuffleWriteSizes) - val listings: Seq[Seq[String]] = Seq(serviceQuantiles, + val listings: Seq[Seq[String]] = Seq( + serviceQuantiles, + gettingResultQuantiles, + schedulerDelayQuantiles, if (hasShuffleRead) shuffleReadQuantiles else Nil, if (hasShuffleWrite) shuffleWriteQuantiles else Nil) |