[SPARK-17433] YarnShuffleService doesn't handle moving credentials levelDb

The secrets leveldb isn't being moved if you run spark shuffle services without yarn nm recovery on and then turn it on. This fixes that. I unfortunately missed this when I ported the patch from our internal branch 2 to master branch due to the changes for the recovery path. Note this only applies to master since it is the only place the yarn nm recovery dir is used. Unit tests ran and tested on 8 node cluster. Fresh startup with NM recovery, fresh startup no nm recovery, switching between no nm recovery and recovery. Also tested running applications to make sure wasn't affected by rolling upgrade. Author: Thomas Graves <tgraves@prevailsail.corp.gq1.yahoo.com> Author: Tom Graves <tgraves@apache.org> Closes #14999 from tgravescs/SPARK-17433.
author: Thomas Graves <tgraves@prevailsail.corp.gq1.yahoo.com> 2016-09-09 13:43:32 -0500
committer: Tom Graves <tgraves@yahoo-inc.com> 2016-09-09 13:43:32 -0500
commit: a3981c28c956a82ccf5b1c61d45b6bd252d4abed (patch)
tree: 72e420d0b8111a5c33a3d57804aa32c76531f672 /common
parent: 7098a12945e71a159784836b75da855a603e1631 (diff)
download: spark-a3981c28c956a82ccf5b1c61d45b6bd252d4abed.tar.gz
spark-a3981c28c956a82ccf5b1c61d45b6bd252d4abed.tar.bz2
spark-a3981c28c956a82ccf5b1c61d45b6bd252d4abed.zip
1 files changed, 39 insertions, 17 deletions
diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
index df082e4a92..43c8df721d 100644
--- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
+++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
 import java.util.List;
 import java.util.Map;
 
@@ -159,8 +160,7 @@ public class YarnShuffleService extends AuxiliaryService {
       // If we don't find one, then we choose a file to use to save the state next time.  Even if
       // an application was stopped while the NM was down, we expect yarn to call stopApplication()
       // when it comes back
-      registeredExecutorFile =
-        new File(getRecoveryPath().toUri().getPath(), RECOVERY_FILE_NAME);
+      registeredExecutorFile = initRecoveryDb(RECOVERY_FILE_NAME);
 
       TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf));
       blockHandler = new ExternalShuffleBlockHandler(transportConf, registeredExecutorFile);
@@ -196,7 +196,7 @@ public class YarnShuffleService extends AuxiliaryService {
 
   private void createSecretManager() throws IOException {
     secretManager = new ShuffleSecretManager();
-    secretsFile = new File(getRecoveryPath().toUri().getPath(), SECRETS_RECOVERY_FILE_NAME);
+    secretsFile = initRecoveryDb(SECRETS_RECOVERY_FILE_NAME);
  
     // Make sure this is protected in case its not in the NM recovery dir
     FileSystem fs = FileSystem.getLocal(_conf);
@@ -328,37 +328,59 @@ public class YarnShuffleService extends AuxiliaryService {
   }
 
   /**
-   * Get the recovery path, this will override the default one to get our own maintained
-   * recovery path.
+   * Get the path specific to this auxiliary service to use for recovery.
+   */ 
+  protected Path getRecoveryPath(String fileName) {
+    return _recoveryPath;
+  }
+
+  /**
+   * Figure out the recovery path and handle moving the DB if YARN NM recovery gets enabled
+   * when it previously was not. If YARN NM recovery is enabled it uses that path, otherwise
+   * it will uses a YARN local dir.
    */
-  protected Path getRecoveryPath() {
+  protected File initRecoveryDb(String dbFileName) {
+    if (_recoveryPath != null) {
+        File recoveryFile = new File(_recoveryPath.toUri().getPath(), dbFileName);
+        if (recoveryFile.exists()) {
+          return recoveryFile;
+        }
+    } 
+    // db doesn't exist in recovery path go check local dirs for it
     String[] localDirs = _conf.getTrimmedStrings("yarn.nodemanager.local-dirs");
     for (String dir : localDirs) {
-      File f = new File(new Path(dir).toUri().getPath(), RECOVERY_FILE_NAME);
+      File f = new File(new Path(dir).toUri().getPath(), dbFileName);
       if (f.exists()) {
         if (_recoveryPath == null) {
           // If NM recovery is not enabled, we should specify the recovery path using NM local
           // dirs, which is compatible with the old code.
           _recoveryPath = new Path(dir);
+          return f;
         } else {
-          // If NM recovery is enabled and the recovery file exists in old NM local dirs, which
-          // means old version of Spark already generated the recovery file, we should copy the
-          // old file in to a new recovery path for the compatibility.
-          if (!f.renameTo(new File(_recoveryPath.toUri().getPath(), RECOVERY_FILE_NAME))) {
-            // Fail to move recovery file to new path
-            logger.error("Failed to move recovery file {} to the path {}",
-              RECOVERY_FILE_NAME, _recoveryPath.toString());
+          // If the recovery path is set then either NM recovery is enabled or another recovery
+          // DB has been initialized. If NM recovery is enabled and had set the recovery path
+          // make sure to move all DBs to the recovery path from the old NM local dirs.
+          // If another DB was initialized first just make sure all the DBs are in the same
+          // location.
+          File newLoc = new File(_recoveryPath.toUri().getPath(), dbFileName);
+          if (!newLoc.equals(f)) {
+            try {
+              Files.move(f.toPath(), newLoc.toPath());
+            } catch (Exception e) {
+              // Fail to move recovery file to new path, just continue on with new DB location
+              logger.error("Failed to move recovery file {} to the path {}",
+                dbFileName, _recoveryPath.toString(), e);
+            }
           }
+          return newLoc;
         }
-        break;
       }
     }
-
     if (_recoveryPath == null) {
       _recoveryPath = new Path(localDirs[0]);
     }
 
-    return _recoveryPath;
+    return new File(_recoveryPath.toUri().getPath(), dbFileName);
   }
 
   /**
author	Thomas Graves <tgraves@prevailsail.corp.gq1.yahoo.com>	2016-09-09 13:43:32 -0500
committer	Tom Graves <tgraves@yahoo-inc.com>	2016-09-09 13:43:32 -0500
commit	a3981c28c956a82ccf5b1c61d45b6bd252d4abed (patch)
tree	72e420d0b8111a5c33a3d57804aa32c76531f672 /common
parent	7098a12945e71a159784836b75da855a603e1631 (diff)
download	spark-a3981c28c956a82ccf5b1c61d45b6bd252d4abed.tar.gz spark-a3981c28c956a82ccf5b1c61d45b6bd252d4abed.tar.bz2 spark-a3981c28c956a82ccf5b1c61d45b6bd252d4abed.zip