author    Mridul Muralidharan <mridul@gmail.com>  2013-04-17 23:13:02 +0530
committer Mridul Muralidharan <mridul@gmail.com>  2013-04-17 23:13:02 +0530
commit    f07961060d8d9dd85ab2a581adc45f886bb0e629 (patch)
tree      02cea2719d5ddb8d40839bd0a7c883ed6c2a211c /core/src
parent    5d891534fd5ca268f6ba7c9a47680846eb3a15ae (diff)
Add a small note on spark.tasks.schedule.aggression
Diffstat (limited to 'core/src')
-rw-r--r--  core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala | 7
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
index 2e18d46edc..a9d9c5e44c 100644
--- a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
+++ b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
@@ -47,6 +47,11 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
- ANY
Note that this property makes more sense when used in conjunction with spark.tasks.revive.interval > 0: otherwise it is not very effective.
+
+ Additional note: For non-trivial clusters, the running time can differ by 4x - 5x (in some of our experiments) depending on whether
+ it is left at the default HOST_LOCAL, or set to RACK_LOCAL (if the cluster is configured to be rack aware) or ANY.
+ If the cluster is rack aware, setting it to RACK_LOCAL gives the best tradeoff: a 3x - 4x performance improvement while minimizing IO impact.
+ It also drastically reduces the variance in running time.
*/
val TASK_SCHEDULING_AGGRESSION = TaskLocality.parse(System.getProperty("spark.tasks.schedule.aggression", "HOST_LOCAL"))
@@ -68,7 +73,7 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
val activeExecutorIds = new HashSet[String]
// TODO: We might want to remove this and merge it with execId datastructures - but later.
- // Which hosts in the cluster are alive (contains hostPort's)
+ // Which hosts in the cluster are alive (contains hostPorts) - used for hyper-local and local task locality.
private val hostPortsAlive = new HashSet[String]
private val hostToAliveHostPorts = new HashMap[String, HashSet[String]]
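
For readers trying out the new setting, here is a minimal, hypothetical driver sketch showing how the two properties mentioned in the comment above could be set. It assumes a pre-1.0 Spark build where SparkContext lives in the spark package and configuration is read from JVM system properties; the master URL, input path, application name, and the revive-interval value are placeholders, not values taken from this commit.

// Hedged sketch: tuning spark.tasks.schedule.aggression from a driver program.
// The property names come from the diff above; everything else (master URL,
// input path, interval value) is illustrative only.
import spark.SparkContext

object LocalityAggressionExample {
  def main(args: Array[String]) {
    // ClusterScheduler reads these via System.getProperty when it is constructed,
    // so they must be set before the SparkContext is created.
    System.setProperty("spark.tasks.schedule.aggression", "RACK_LOCAL") // HOST_LOCAL (default), RACK_LOCAL or ANY
    System.setProperty("spark.tasks.revive.interval", "1000")           // the note above suggests a value > 0; 1000 is illustrative

    val sc = new SparkContext("spark://master:7077", "LocalityAggressionExample")

    // Any simple job will do; this just forces tasks to be scheduled.
    val totalChars = sc.textFile("hdfs:///data/input").map(_.length.toLong).reduce(_ + _)
    println("total characters: " + totalChars)

    sc.stop()
  }
}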