[SPARK-14468] Always enable OutputCommitCoordinator

## What changes were proposed in this pull request?

`OutputCommitCoordinator` was introduced to deal with concurrent task attempts racing to write output, which can lead to data loss or corruption. For more detail, read the [JIRA description](https://issues.apache.org/jira/browse/SPARK-14468).

Before: `OutputCommitCoordinator` is enabled only if speculation is enabled.

After: `OutputCommitCoordinator` is always enabled.

Users may still disable this through `spark.hadoop.outputCommitCoordination.enabled`, but they really shouldn't...

## How was this patch tested?

`OutputCommitCoordinator*Suite`

Author: Andrew Or <andrew@databricks.com>

Closes #12244 from andrewor14/always-occ.
author: Andrew Or <andrew@databricks.com> 2016-04-07 17:49:39 -0700
committer: Andrew Or <andrew@databricks.com> 2016-04-07 17:49:39 -0700
commit: 3e29e372ff518827bae9dcd26087946fde476843 (patch)
tree: 3abd3e22678fb63e347832e1241bf94fd2a8e6b9 /core
parent: 30e980ad8e6443dddd54f3c2d48b3904499545cf (diff)
download: spark-3e29e372ff518827bae9dcd26087946fde476843.tar.gz
spark-3e29e372ff518827bae9dcd26087946fde476843.tar.bz2
spark-3e29e372ff518827bae9dcd26087946fde476843.zip
3 files changed, 8 insertions, 12 deletions
diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala
index 891facba33..607283a306 100644
--- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala
+++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala
@@ -33,11 +33,8 @@ object SparkHadoopMapRedUtil extends Logging {
    * the driver in order to determine whether this attempt can commit (please see SPARK-4879 for
    * details).
    *
-   * Output commit coordinator is only contacted when the following two configurations are both set
-   * to `true`:
-   *
-   *  - `spark.speculation`
-   *  - `spark.hadoop.outputCommitCoordination.enabled`
+   * Output commit coordinator is only used when `spark.hadoop.outputCommitCoordination.enabled`
+   * is set to true (which is the default).
    */
   def commitTask(
       committer: MapReduceOutputCommitter,
@@ -64,11 +61,10 @@ object SparkHadoopMapRedUtil extends Logging {
     if (committer.needsTaskCommit(mrTaskContext)) {
       val shouldCoordinateWithDriver: Boolean = {
         val sparkConf = SparkEnv.get.conf
-        // We only need to coordinate with the driver if there are multiple concurrent task
-        // attempts, which should only occur if speculation is enabled
-        val speculationEnabled = sparkConf.getBoolean("spark.speculation", defaultValue = false)
-        // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs
-        sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", speculationEnabled)
+        // We only need to coordinate with the driver if there are concurrent task attempts.
+        // Note that this could happen even when speculation is not enabled (e.g. see SPARK-8029).
+        // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs.
+        sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", defaultValue = true)
       }
 
       if (shouldCoordinateWithDriver) {
diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala
index 9f41aca8a1..601f1c378c 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala
@@ -38,7 +38,7 @@ class OutputCommitCoordinatorIntegrationSuite
     super.beforeAll()
     val conf = new SparkConf()
       .set("master", "local[2,4]")
-      .set("spark.speculation", "true")
+      .set("spark.hadoop.outputCommitCoordination.enabled", "true")
       .set("spark.hadoop.mapred.output.committer.class",
         classOf[ThrowExceptionOnFirstAttemptOutputCommitter].getCanonicalName)
     sc = new SparkContext("local[2, 4]", "test", conf)
diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
index c461da65bd..8e509de767 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala
@@ -77,7 +77,7 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter {
     val conf = new SparkConf()
       .setMaster("local[4]")
       .setAppName(classOf[OutputCommitCoordinatorSuite].getSimpleName)
-      .set("spark.speculation", "true")
+      .set("spark.hadoop.outputCommitCoordination.enabled", "true")
     sc = new SparkContext(conf) {
       override private[spark] def createSparkEnv(
           conf: SparkConf,
author	Andrew Or <andrew@databricks.com>	2016-04-07 17:49:39 -0700
committer	Andrew Or <andrew@databricks.com>	2016-04-07 17:49:39 -0700
commit	3e29e372ff518827bae9dcd26087946fde476843 (patch)
tree	3abd3e22678fb63e347832e1241bf94fd2a8e6b9 /core
parent	30e980ad8e6443dddd54f3c2d48b3904499545cf (diff)
download	spark-3e29e372ff518827bae9dcd26087946fde476843.tar.gz spark-3e29e372ff518827bae9dcd26087946fde476843.tar.bz2 spark-3e29e372ff518827bae9dcd26087946fde476843.zip