aboutsummaryrefslogtreecommitdiff
path: root/common/network-yarn/src/main
diff options
context:
space:
mode:
authorThomas Graves <tgraves@staydecay.corp.gq1.yahoo.com>2016-09-02 10:42:13 -0700
committerMarcelo Vanzin <vanzin@cloudera.com>2016-09-02 10:42:13 -0700
commite79962f2f3955485aecf32939207d8ee6ccd2704 (patch)
treeb2d3bdc02246f2293e8006d00c4b9af3c78e54d6 /common/network-yarn/src/main
parent419eefd811a4e29a73bc309157f150751e478db5 (diff)
downloadspark-e79962f2f3955485aecf32939207d8ee6ccd2704.tar.gz
spark-e79962f2f3955485aecf32939207d8ee6ccd2704.tar.bz2
spark-e79962f2f3955485aecf32939207d8ee6ccd2704.zip
[SPARK-16711] YarnShuffleService doesn't re-init properly on YARN rolling upgrade
The Spark Yarn Shuffle Service doesn't re-initialize the application credentials early enough which causes any other spark executors trying to fetch from that node during a rolling upgrade to fail with "java.lang.NullPointerException: Password cannot be null if SASL is enabled". Right now the spark shuffle service relies on the Yarn nodemanager to re-register the applications, unfortunately this is after we open the port for other executors to connect. If other executors connected before the re-register they get a null pointer exception which isn't a re-tryable exception and cause them to fail pretty quickly. To solve this I added another leveldb file so that it can save and re-initialize all the applications before opening the port for other executors to connect to it. Adding another leveldb was simpler from the code structure point of view. Most of the code changes are moving things to common util class. Patch was tested manually on a Yarn cluster with rolling upgrade was happing while spark job was running. Without the patch I consistently get the NullPointerException, with the patch the job gets a few Connection refused exceptions but the retries kick in and the it succeeds. Author: Thomas Graves <tgraves@staydecay.corp.gq1.yahoo.com> Closes #14718 from tgravescs/SPARK-16711.
Diffstat (limited to 'common/network-yarn/src/main')
-rw-r--r--common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java135
1 files changed, 127 insertions, 8 deletions
diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
index 2cf3f53e6d..df082e4a92 100644
--- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
+++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java
@@ -18,15 +18,28 @@
package org.apache.spark.network.yarn;
import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.nio.ByteBuffer;
import java.util.List;
+import java.util.Map;
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.server.api.*;
+import org.apache.spark.network.util.LevelDBProvider;
+import org.iq80.leveldb.DB;
+import org.iq80.leveldb.DBIterator;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -69,12 +82,26 @@ public class YarnShuffleService extends AuxiliaryService {
private static final boolean DEFAULT_SPARK_AUTHENTICATE = false;
private static final String RECOVERY_FILE_NAME = "registeredExecutors.ldb";
+ private static final String SECRETS_RECOVERY_FILE_NAME = "sparkShuffleRecovery.ldb";
// Whether failure during service initialization should stop the NM.
@VisibleForTesting
static final String STOP_ON_FAILURE_KEY = "spark.yarn.shuffle.stopOnFailure";
private static final boolean DEFAULT_STOP_ON_FAILURE = false;
+ // just for testing when you want to find an open port
+ @VisibleForTesting
+ static int boundPort = -1;
+ private static final ObjectMapper mapper = new ObjectMapper();
+ private static final String APP_CREDS_KEY_PREFIX = "AppCreds";
+ private static final LevelDBProvider.StoreVersion CURRENT_VERSION = new LevelDBProvider
+ .StoreVersion(1, 0);
+
+ // just for integration tests that want to look at this file -- in general not sensible as
+ // a static
+ @VisibleForTesting
+ static YarnShuffleService instance;
+
// An entity that manages the shuffle secret per application
// This is used only if authentication is enabled
private ShuffleSecretManager secretManager;
@@ -96,14 +123,11 @@ public class YarnShuffleService extends AuxiliaryService {
@VisibleForTesting
File registeredExecutorFile;
- // just for testing when you want to find an open port
+ // Where to store & reload application secrets for recovering state after an NM restart
@VisibleForTesting
- static int boundPort = -1;
+ File secretsFile;
- // just for integration tests that want to look at this file -- in general not sensible as
- // a static
- @VisibleForTesting
- static YarnShuffleService instance;
+ private DB db;
public YarnShuffleService() {
super("spark_shuffle");
@@ -143,10 +167,10 @@ public class YarnShuffleService extends AuxiliaryService {
// If authentication is enabled, set up the shuffle server to use a
// special RPC handler that filters out unauthenticated fetch requests
- boolean authEnabled = conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE);
List<TransportServerBootstrap> bootstraps = Lists.newArrayList();
+ boolean authEnabled = conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE);
if (authEnabled) {
- secretManager = new ShuffleSecretManager();
+ createSecretManager();
bootstraps.add(new SaslServerBootstrap(transportConf, secretManager));
}
@@ -170,6 +194,50 @@ public class YarnShuffleService extends AuxiliaryService {
}
}
+ private void createSecretManager() throws IOException {
+ secretManager = new ShuffleSecretManager();
+ secretsFile = new File(getRecoveryPath().toUri().getPath(), SECRETS_RECOVERY_FILE_NAME);
+
+ // Make sure this is protected in case its not in the NM recovery dir
+ FileSystem fs = FileSystem.getLocal(_conf);
+ fs.mkdirs(new Path(secretsFile.getPath()), new FsPermission((short)0700));
+
+ db = LevelDBProvider.initLevelDB(secretsFile, CURRENT_VERSION, mapper);
+ logger.info("Recovery location is: " + secretsFile.getPath());
+ if (db != null) {
+ logger.info("Going to reload spark shuffle data");
+ DBIterator itr = db.iterator();
+ itr.seek(APP_CREDS_KEY_PREFIX.getBytes(StandardCharsets.UTF_8));
+ while (itr.hasNext()) {
+ Map.Entry<byte[], byte[]> e = itr.next();
+ String key = new String(e.getKey(), StandardCharsets.UTF_8);
+ if (!key.startsWith(APP_CREDS_KEY_PREFIX)) {
+ break;
+ }
+ String id = parseDbAppKey(key);
+ ByteBuffer secret = mapper.readValue(e.getValue(), ByteBuffer.class);
+ logger.info("Reloading tokens for app: " + id);
+ secretManager.registerApp(id, secret);
+ }
+ }
+ }
+
+ private static String parseDbAppKey(String s) throws IOException {
+ if (!s.startsWith(APP_CREDS_KEY_PREFIX)) {
+ throw new IllegalArgumentException("expected a string starting with " + APP_CREDS_KEY_PREFIX);
+ }
+ String json = s.substring(APP_CREDS_KEY_PREFIX.length() + 1);
+ AppId parsed = mapper.readValue(json, AppId.class);
+ return parsed.appId;
+ }
+
+ private static byte[] dbAppKey(AppId appExecId) throws IOException {
+ // we stick a common prefix on all the keys so we can find them in the DB
+ String appExecJson = mapper.writeValueAsString(appExecId);
+ String key = (APP_CREDS_KEY_PREFIX + ";" + appExecJson);
+ return key.getBytes(StandardCharsets.UTF_8);
+ }
+
@Override
public void initializeApplication(ApplicationInitializationContext context) {
String appId = context.getApplicationId().toString();
@@ -177,6 +245,12 @@ public class YarnShuffleService extends AuxiliaryService {
ByteBuffer shuffleSecret = context.getApplicationDataForService();
logger.info("Initializing application {}", appId);
if (isAuthenticationEnabled()) {
+ AppId fullId = new AppId(appId);
+ if (db != null) {
+ byte[] key = dbAppKey(fullId);
+ byte[] value = mapper.writeValueAsString(shuffleSecret).getBytes(StandardCharsets.UTF_8);
+ db.put(key, value);
+ }
secretManager.registerApp(appId, shuffleSecret);
}
} catch (Exception e) {
@@ -190,6 +264,14 @@ public class YarnShuffleService extends AuxiliaryService {
try {
logger.info("Stopping application {}", appId);
if (isAuthenticationEnabled()) {
+ AppId fullId = new AppId(appId);
+ if (db != null) {
+ try {
+ db.delete(dbAppKey(fullId));
+ } catch (IOException e) {
+ logger.error("Error deleting {} from executor state db", appId, e);
+ }
+ }
secretManager.unregisterApp(appId);
}
blockHandler.applicationRemoved(appId, false /* clean up local dirs */);
@@ -222,6 +304,9 @@ public class YarnShuffleService extends AuxiliaryService {
if (blockHandler != null) {
blockHandler.close();
}
+ if (db != null) {
+ db.close();
+ }
} catch (Exception e) {
logger.error("Exception when stopping service", e);
}
@@ -275,4 +360,38 @@ public class YarnShuffleService extends AuxiliaryService {
return _recoveryPath;
}
+
+ /**
+ * Simply encodes an application ID.
+ */
+ public static class AppId {
+ public final String appId;
+
+ @JsonCreator
+ public AppId(@JsonProperty("appId") String appId) {
+ this.appId = appId;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ AppId appExecId = (AppId) o;
+ return Objects.equal(appId, appExecId.appId);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hashCode(appId);
+ }
+
+ @Override
+ public String toString() {
+ return Objects.toStringHelper(this)
+ .add("appId", appId)
+ .toString();
+ }
+ }
+
}