diff options
author | Mridul Muralidharan <mridul@gmail.com> | 2013-04-17 23:13:02 +0530 |
---|---|---|
committer | Mridul Muralidharan <mridul@gmail.com> | 2013-04-17 23:13:02 +0530 |
commit | f07961060d8d9dd85ab2a581adc45f886bb0e629 (patch) | |
tree | 02cea2719d5ddb8d40839bd0a7c883ed6c2a211c /core | |
parent | 5d891534fd5ca268f6ba7c9a47680846eb3a15ae (diff) | |
download | spark-f07961060d8d9dd85ab2a581adc45f886bb0e629.tar.gz spark-f07961060d8d9dd85ab2a581adc45f886bb0e629.tar.bz2 spark-f07961060d8d9dd85ab2a581adc45f886bb0e629.zip |
Add a small note on spark.tasks.schedule.aggression
Diffstat (limited to 'core')
-rw-r--r-- | core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala | 7 |
1 files changed, 6 insertions, 1 deletions
diff --git a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala index 2e18d46edc..a9d9c5e44c 100644 --- a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala +++ b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala @@ -47,6 +47,11 @@ private[spark] class ClusterScheduler(val sc: SparkContext) - ANY Note that this property makes more sense when used in conjunction with spark.tasks.revive.interval > 0 : else it is not very effective. + + Additional Note: For non-trivial clusters, there is a 4x - 5x reduction in running time (in some of our experiments) based on whether + it is left at default HOST_LOCAL, RACK_LOCAL (if cluster is configured to be rack aware) or ANY. + If cluster is rack aware, then setting it to RACK_LOCAL gives best tradeoff and a 3x - 4x performance improvement while minimizing IO impact. + Also, it brings down the variance in running time drastically. */ val TASK_SCHEDULING_AGGRESSION = TaskLocality.parse(System.getProperty("spark.tasks.schedule.aggression", "HOST_LOCAL")) @@ -68,7 +73,7 @@ private[spark] class ClusterScheduler(val sc: SparkContext) val activeExecutorIds = new HashSet[String] // TODO: We might want to remove this and merge it with execId datastructures - but later. - // Which hosts in the cluster are alive (contains hostPort's) + // Which hosts in the cluster are alive (contains hostPort's) - used for hyper local and local task locality. private val hostPortsAlive = new HashSet[String] private val hostToAliveHostPorts = new HashMap[String, HashSet[String]] |