author    Mridul Muralidharan <mridul@gmail.com>  2013-04-17 23:13:02 +0530
committer Mridul Muralidharan <mridul@gmail.com>  2013-04-17 23:13:02 +0530
commit    f07961060d8d9dd85ab2a581adc45f886bb0e629 (patch)
tree      02cea2719d5ddb8d40839bd0a7c883ed6c2a211c /core/src
parent    5d891534fd5ca268f6ba7c9a47680846eb3a15ae (diff)
Add a small note on spark.tasks.schedule.aggression
Diffstat (limited to 'core/src')
-rw-r--r--  core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala | 7
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
index 2e18d46edc..a9d9c5e44c 100644
--- a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
+++ b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
@@ -47,6 +47,11 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
- ANY
Note that this property makes more sense when used in conjunction with spark.tasks.revive.interval > 0: otherwise it is not very effective.
+
+ Additional note: For non-trivial clusters, the running time can differ by 4x - 5x (in some of our experiments) depending on whether
+ it is left at the default HOST_LOCAL, or set to RACK_LOCAL (if the cluster is configured to be rack aware) or ANY.
+ If the cluster is rack aware, setting it to RACK_LOCAL gives the best tradeoff: a 3x - 4x performance improvement while minimizing IO impact.
+ It also drastically reduces the variance in running time.
*/
val TASK_SCHEDULING_AGGRESSION = TaskLocality.parse(System.getProperty("spark.tasks.schedule.aggression", "HOST_LOCAL"))
@@ -68,7 +73,7 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
val activeExecutorIds = new HashSet[String]
// TODO: We might want to remove this and merge it with execId datastructures - but later.
- // Which hosts in the cluster are alive (contains hostPort's)
+ // Which hosts in the cluster are alive (contains hostPorts) - used for hyper-local and local task locality.
private val hostPortsAlive = new HashSet[String]
private val hostToAliveHostPorts = new HashMap[String, HashSet[String]]
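
For readers trying out the new setting, here is a minimal, hypothetical driver sketch showing how the two properties mentioned in the comment above could be set. It assumes a pre-1.0 Spark build where SparkContext lives in the spark package and configuration is read from JVM system properties; the master URL, input path, application name, and the revive-interval value are placeholders, not values taken from this commit.

// Hedged sketch: tuning spark.tasks.schedule.aggression from a driver program.
// The property names come from the diff above; everything else (master URL,
// input path, interval value) is illustrative only.
import spark.SparkContext

object LocalityAggressionExample {
  def main(args: Array[String]) {
    // ClusterScheduler reads these via System.getProperty when it is constructed,
    // so they must be set before the SparkContext is created.
    System.setProperty("spark.tasks.schedule.aggression", "RACK_LOCAL") // HOST_LOCAL (default), RACK_LOCAL or ANY
    System.setProperty("spark.tasks.revive.interval", "1000")           // the note above suggests a value > 0; 1000 is illustrative

    val sc = new SparkContext("spark://master:7077", "LocalityAggressionExample")

    // Any simple job will do; this just forces tasks to be scheduled.
    val totalChars = sc.textFile("hdfs:///data/input").map(_.length.toLong).reduce(_ + _)
    println("total characters: " + totalChars)

    sc.stop()
  }
}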