From e79962f2f3955485aecf32939207d8ee6ccd2704 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Fri, 2 Sep 2016 10:42:13 -0700 Subject: [SPARK-16711] YarnShuffleService doesn't re-init properly on YARN rolling upgrade The Spark Yarn Shuffle Service doesn't re-initialize the application credentials early enough which causes any other spark executors trying to fetch from that node during a rolling upgrade to fail with "java.lang.NullPointerException: Password cannot be null if SASL is enabled". Right now the spark shuffle service relies on the Yarn nodemanager to re-register the applications, unfortunately this is after we open the port for other executors to connect. If other executors connected before the re-register they get a null pointer exception which isn't a re-tryable exception and cause them to fail pretty quickly. To solve this I added another leveldb file so that it can save and re-initialize all the applications before opening the port for other executors to connect to it. Adding another leveldb was simpler from the code structure point of view. Most of the code changes are moving things to common util class. Patch was tested manually on a Yarn cluster with rolling upgrade was happing while spark job was running. Without the patch I consistently get the NullPointerException, with the patch the job gets a few Connection refused exceptions but the retries kick in and the it succeeds. Author: Thomas Graves Closes #14718 from tgravescs/SPARK-16711. --- .../apache/spark/network/yarn/YarnShuffleServiceSuite.scala | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'yarn') diff --git a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala index e123e78541..9a071862bb 100644 --- a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala @@ -33,6 +33,7 @@ import org.apache.hadoop.yarn.server.api.{ApplicationInitializationContext, Appl import org.scalatest.{BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.Eventually._ +import org.apache.spark.SecurityManager import org.apache.spark.SparkFunSuite import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo @@ -77,6 +78,8 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd test("executor state kept across NM restart") { s1 = new YarnShuffleService + // set auth to true to test the secrets recovery + yarnConfig.setBoolean(SecurityManager.SPARK_AUTH_CONF, true) s1.init(yarnConfig) val app1Id = ApplicationId.newInstance(0, 1) val app1Data: ApplicationInitializationContext = @@ -89,6 +92,8 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd val execStateFile = s1.registeredExecutorFile execStateFile should not be (null) + val secretsFile = s1.secretsFile + secretsFile should not be (null) val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER) val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER) @@ -118,6 +123,7 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd s1.stop() s2 = new YarnShuffleService s2.init(yarnConfig) + s2.secretsFile should be (secretsFile) s2.registeredExecutorFile should be (execStateFile) val handler2 = s2.blockHandler @@ -135,6 +141,7 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd s3 = new YarnShuffleService s3.init(yarnConfig) s3.registeredExecutorFile should be (execStateFile) + s3.secretsFile should be (secretsFile) val handler3 = s3.blockHandler val resolver3 = ShuffleTestAccessor.getBlockResolver(handler3) @@ -148,7 +155,10 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd test("removed applications should not be in registered executor file") { s1 = new YarnShuffleService + yarnConfig.setBoolean(SecurityManager.SPARK_AUTH_CONF, false) s1.init(yarnConfig) + val secretsFile = s1.secretsFile + secretsFile should be (null) val app1Id = ApplicationId.newInstance(0, 1) val app1Data: ApplicationInitializationContext = new ApplicationInitializationContext("user", app1Id, null) -- cgit v1.2.3