author     Marcelo Vanzin <vanzin@cloudera.com>    2016-04-04 16:52:21 -0700
committer  Josh Rosen <joshrosen@databricks.com>   2016-04-04 16:52:22 -0700
commit     24d7d2e453ab5eef6099a32fb9e8ed60f6ada93a (patch)
tree       2069beb0e471afa4e1b1867efe786100b7f77f79
parent     400b2f863ffaa01a34a8dae1541c61526fef908b (diff)
[SPARK-13579][BUILD] Stop building the main Spark assembly.
This change modifies the "assembly/" module to just copy needed dependencies to its build directory, and modifies the packaging script to pick those up (and remove duplicate jars packaged in the examples module).

I also made some minor adjustments to dependencies to remove some test jars from the final packaging, and to remove jars that conflict with each other when packaged separately (e.g. the servlet API).

Also note that this change restores Guava in applications' classpaths, even though it's still shaded inside Spark. This is now needed for the Hadoop libraries that are packaged with Spark, which are no longer processed by the shade plugin.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #11796 from vanzin/SPARK-13579.
-rw-r--r--   assembly/pom.xml | 101
-rwxr-xr-x   bin/spark-class | 11
-rw-r--r--   bin/spark-class2.cmd | 5
-rw-r--r--   core/src/main/scala/org/apache/spark/util/Utils.scala | 4
-rw-r--r--   core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala | 72
-rw-r--r--   dev/deps/spark-deps-hadoop-2.2 | 4
-rw-r--r--   dev/deps/spark-deps-hadoop-2.3 | 4
-rw-r--r--   dev/deps/spark-deps-hadoop-2.4 | 4
-rw-r--r--   dev/deps/spark-deps-hadoop-2.6 | 4
-rw-r--r--   dev/deps/spark-deps-hadoop-2.7 | 4
-rwxr-xr-x   dev/make-distribution.sh | 25
-rwxr-xr-x   dev/mima | 6
-rwxr-xr-x   dev/run-tests.py | 11
-rw-r--r--   docs/sql-programming-guide.md | 7
-rw-r--r--   examples/pom.xml | 80
-rw-r--r--   launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java | 47
-rw-r--r--   launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java | 4
-rw-r--r--   launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java | 11
-rw-r--r--   pom.xml | 44
-rw-r--r--   project/SparkBuild.scala | 45
-rw-r--r--   python/pyspark/streaming/tests.py | 6
-rwxr-xr-x   python/run-tests.py | 18
-rw-r--r--   sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala | 4
-rw-r--r--   sql/hive/pom.xml | 24
-rw-r--r--   yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 3
-rw-r--r--   yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala | 2
26 files changed, 231 insertions(+), 319 deletions(-)
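For orientation, here is a rough sketch of where the jars end up after this change. The paths come from the spark-class and make-distribution.sh hunks below; the profile flags, the default dist/ output directory, and the jar names shown are illustrative assumptions, not part of the commit:

    # Development build: dependencies are copied next to the Spark jars instead of
    # being shaded into a single spark-assembly jar.
    ./build/sbt -Pyarn -Phive assembly/package
    ls assembly/target/scala-2.11/jars/

    # Binary distribution: make-distribution.sh now populates jars/ rather than lib/.
    ./dev/make-distribution.sh --tgz -Pyarn -Phive
    ls dist/jars/            # individual dependency jars, e.g. guava-14.0.1.jar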
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 477d4931c3..22cbac06ca 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -33,9 +33,8 @@
<properties>
<sbt.project.name>assembly</sbt.project.name>
- <spark.jar.dir>scala-${scala.binary.version}</spark.jar.dir>
- <spark.jar.basename>spark-assembly-${project.version}-hadoop${hadoop.version}.jar</spark.jar.basename>
- <spark.jar>${project.build.directory}/${spark.jar.dir}/${spark.jar.basename}</spark.jar>
+ <build.testJarPhase>none</build.testJarPhase>
+ <build.copyDependenciesPhase>package</build.copyDependenciesPhase>
</properties>
<dependencies>
@@ -69,6 +68,17 @@
<artifactId>spark-repl_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
+
+ <!--
+ Because we don't shade dependencies anymore, we need to restore Guava to compile scope so
+ that the libraries Spark depend on have it available. We'll package the version that Spark
+ uses (14.0.1) which is not the same as Hadoop dependencies, but works.
+ -->
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <scope>${hadoop.deps.scope}</scope>
+ </dependency>
</dependencies>
<build>
@@ -87,75 +97,26 @@
<skip>true</skip>
</configuration>
</plugin>
- <!-- zip pyspark archives to run python application on yarn mode -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-antrun-plugin</artifactId>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>run</goal>
- </goals>
- </execution>
- </executions>
- <configuration>
- <target>
- <delete dir="${basedir}/../python/lib/pyspark.zip"/>
- <zip destfile="${basedir}/../python/lib/pyspark.zip">
- <fileset dir="${basedir}/../python/" includes="pyspark/**/*"/>
- </zip>
- </target>
- </configuration>
- </plugin>
- <!-- Use the shade plugin to create a big JAR with all the dependencies -->
+ <!-- zip pyspark archives to run python application on yarn mode -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <configuration>
- <shadedArtifactAttached>false</shadedArtifactAttached>
- <outputFile>${spark.jar}</outputFile>
- <artifactSet>
- <includes>
- <include>*:*</include>
- </includes>
- </artifactSet>
- <filters>
- <filter>
- <artifact>*:*</artifact>
- <excludes>
- <exclude>org/datanucleus/**</exclude>
- <exclude>META-INF/*.SF</exclude>
- <exclude>META-INF/*.DSA</exclude>
- <exclude>META-INF/*.RSA</exclude>
- </excludes>
- </filter>
- </filters>
- </configuration>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <transformers>
- <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
- <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
- <resource>META-INF/services/org.apache.hadoop.fs.FileSystem</resource>
- </transformer>
- <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
- <resource>reference.conf</resource>
- </transformer>
- <transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
- <resource>log4j.properties</resource>
- </transformer>
- <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/>
- <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"/>
- </transformers>
- </configuration>
- </execution>
- </executions>
+ <artifactId>maven-antrun-plugin</artifactId>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>run</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <target>
+ <delete dir="${basedir}/../python/lib/pyspark.zip"/>
+ <zip destfile="${basedir}/../python/lib/pyspark.zip">
+ <fileset dir="${basedir}/../python/" includes="pyspark/**/*"/>
+ </zip>
+ </target>
+ </configuration>
</plugin>
</plugins>
</build>
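The two properties added above are hooks into plugin executions declared in the parent pom.xml (see the pom.xml hunk further down): build.copyDependenciesPhase=package binds the maven-dependency-plugin copy-dependencies goal to the package phase for this module, while build.testJarPhase=none disables the shared test-jar execution. A minimal sketch of exercising just this module with Maven; the module list and flags are illustrative:

    # Build only the assembly module and the modules it depends on; runtime
    # dependencies get copied under assembly/target/scala-*/jars/.
    ./build/mvn -DskipTests -pl assembly -am package
    ls assembly/target/scala-2.11/jars | head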
diff --git a/bin/spark-class b/bin/spark-class
index e710e388be..b489591778 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -36,21 +36,20 @@ else
fi
# Find Spark jars.
-# TODO: change the directory name when Spark jars move from "lib".
if [ -f "${SPARK_HOME}/RELEASE" ]; then
- SPARK_JARS_DIR="${SPARK_HOME}/lib"
+ SPARK_JARS_DIR="${SPARK_HOME}/jars"
else
- SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION"
+ SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars"
fi
-if [ ! -d "$SPARK_JARS_DIR" ]; then
+if [ ! -d "$SPARK_JARS_DIR" ] && [ -z "$SPARK_TESTING$SPARK_SQL_TESTING" ]; then
echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." 1>&2
echo "You need to build Spark before running this program." 1>&2
exit 1
+else
+ LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*"
fi
-LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*"
-
# Add the launcher build dir to the classpath if requested.
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd
index 565b87c102..579efff909 100644
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd
@@ -29,11 +29,10 @@ if "x%1"=="x" (
)
rem Find Spark jars.
-rem TODO: change the directory name when Spark jars move from "lib".
if exist "%SPARK_HOME%\RELEASE" (
- set SPARK_JARS_DIR="%SPARK_HOME%\lib"
+ set SPARK_JARS_DIR="%SPARK_HOME%\jars"
) else (
- set SPARK_JARS_DIR="%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%"
+ set SPARK_JARS_DIR="%SPARK_HOME%\assembly\target\scala-%SPARK_SCALA_VERSION%\jars"
)
if not exist "%SPARK_JARS_DIR%"\ (
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 50bcf85805..c304629bcd 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -1121,9 +1121,9 @@ private[spark] object Utils extends Logging {
extraEnvironment: Map[String, String] = Map.empty,
redirectStderr: Boolean = true): String = {
val process = executeCommand(command, workingDir, extraEnvironment, redirectStderr)
- val output = new StringBuffer
+ val output = new StringBuilder
val threadName = "read stdout for " + command(0)
- def appendToOutput(s: String): Unit = output.append(s)
+ def appendToOutput(s: String): Unit = output.append(s).append("\n")
val stdoutThread = processStreamByLine(threadName, process.getInputStream, appendToOutput)
val exitCode = process.waitFor()
stdoutThread.join() // Wait for it to finish reading output
diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala
index 280e496498..4fa9f9a8f5 100644
--- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala
@@ -201,24 +201,29 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging {
// Make sure only logging errors
val logger = Logger.getRootLogger
+ val oldLogLevel = logger.getLevel
logger.setLevel(Level.ERROR)
- logger.addAppender(mockAppender)
+ try {
+ logger.addAppender(mockAppender)
- val testOutputStream = new PipedOutputStream()
- val testInputStream = new PipedInputStream(testOutputStream)
+ val testOutputStream = new PipedOutputStream()
+ val testInputStream = new PipedInputStream(testOutputStream)
- // Close the stream before appender tries to read will cause an IOException
- testInputStream.close()
- testOutputStream.close()
- val appender = FileAppender(testInputStream, testFile, new SparkConf)
+ // Close the stream before appender tries to read will cause an IOException
+ testInputStream.close()
+ testOutputStream.close()
+ val appender = FileAppender(testInputStream, testFile, new SparkConf)
- appender.awaitTermination()
+ appender.awaitTermination()
- // If InputStream was closed without first stopping the appender, an exception will be logged
- verify(mockAppender, atLeast(1)).doAppend(loggingEventCaptor.capture)
- val loggingEvent = loggingEventCaptor.getValue
- assert(loggingEvent.getThrowableInformation !== null)
- assert(loggingEvent.getThrowableInformation.getThrowable.isInstanceOf[IOException])
+ // If InputStream was closed without first stopping the appender, an exception will be logged
+ verify(mockAppender, atLeast(1)).doAppend(loggingEventCaptor.capture)
+ val loggingEvent = loggingEventCaptor.getValue
+ assert(loggingEvent.getThrowableInformation !== null)
+ assert(loggingEvent.getThrowableInformation.getThrowable.isInstanceOf[IOException])
+ } finally {
+ logger.setLevel(oldLogLevel)
+ }
}
test("file appender async close stream gracefully") {
@@ -228,30 +233,35 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging {
// Make sure only logging errors
val logger = Logger.getRootLogger
+ val oldLogLevel = logger.getLevel
logger.setLevel(Level.ERROR)
- logger.addAppender(mockAppender)
+ try {
+ logger.addAppender(mockAppender)
- val testOutputStream = new PipedOutputStream()
- val testInputStream = new PipedInputStream(testOutputStream) with LatchedInputStream
+ val testOutputStream = new PipedOutputStream()
+ val testInputStream = new PipedInputStream(testOutputStream) with LatchedInputStream
- // Close the stream before appender tries to read will cause an IOException
- testInputStream.close()
- testOutputStream.close()
- val appender = FileAppender(testInputStream, testFile, new SparkConf)
+ // Close the stream before appender tries to read will cause an IOException
+ testInputStream.close()
+ testOutputStream.close()
+ val appender = FileAppender(testInputStream, testFile, new SparkConf)
- // Stop the appender before an IOException is called during read
- testInputStream.latchReadStarted.await()
- appender.stop()
- testInputStream.latchReadProceed.countDown()
+ // Stop the appender before an IOException is called during read
+ testInputStream.latchReadStarted.await()
+ appender.stop()
+ testInputStream.latchReadProceed.countDown()
- appender.awaitTermination()
+ appender.awaitTermination()
- // Make sure no IOException errors have been logged as a result of appender closing gracefully
- verify(mockAppender, atLeast(0)).doAppend(loggingEventCaptor.capture)
- import scala.collection.JavaConverters._
- loggingEventCaptor.getAllValues.asScala.foreach { loggingEvent =>
- assert(loggingEvent.getThrowableInformation === null
- || !loggingEvent.getThrowableInformation.getThrowable.isInstanceOf[IOException])
+ // Make sure no IOException errors have been logged as a result of appender closing gracefully
+ verify(mockAppender, atLeast(0)).doAppend(loggingEventCaptor.capture)
+ import scala.collection.JavaConverters._
+ loggingEventCaptor.getAllValues.asScala.foreach { loggingEvent =>
+ assert(loggingEvent.getThrowableInformation === null
+ || !loggingEvent.getThrowableInformation.getThrowable.isInstanceOf[IOException])
+ }
+ } finally {
+ logger.setLevel(oldLogLevel)
}
}
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 3865a9fb16..2c24366cc3 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -12,7 +12,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
-avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
bonecp-0.8.0.RELEASE.jar
@@ -61,6 +60,7 @@ grizzly-http-2.1.2.jar
grizzly-http-server-2.1.2.jar
grizzly-http-servlet-2.1.2.jar
grizzly-rcm-2.1.2.jar
+guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.2.0.jar
@@ -164,7 +164,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
-servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@@ -177,7 +176,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
-unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 4313799da7..e9cb0d8f3e 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -12,7 +12,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
-avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
base64-2.3.8.jar
@@ -56,6 +55,7 @@ eigenbase-properties-1.1.5.jar
geronimo-annotation_1.0_spec-1.1.1.jar
geronimo-jaspic_1.0_spec-1.0.jar
geronimo-jta_1.1_spec-1.1.1.jar
+guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.3.0.jar
@@ -155,7 +155,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
-servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@@ -168,7 +167,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
-unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 910ea685f2..d8d1840da5 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -12,7 +12,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
-avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
base64-2.3.8.jar
@@ -56,6 +55,7 @@ eigenbase-properties-1.1.5.jar
geronimo-annotation_1.0_spec-1.1.1.jar
geronimo-jaspic_1.0_spec-1.0.jar
geronimo-jta_1.1_spec-1.1.1.jar
+guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.4.0.jar
@@ -156,7 +156,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
-servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@@ -169,7 +168,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
-unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
xz-1.0.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 0692f24e47..8beede1e38 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -16,7 +16,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
-avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
base64-2.3.8.jar
@@ -61,6 +60,7 @@ geronimo-annotation_1.0_spec-1.1.1.jar
geronimo-jaspic_1.0_spec-1.0.jar
geronimo-jta_1.1_spec-1.1.1.jar
gson-2.2.4.jar
+guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.6.0.jar
@@ -162,7 +162,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
-servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@@ -175,7 +174,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
-unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
xmlenc-0.52.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index e397558e05..a9d814f944 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -16,7 +16,6 @@ asm-3.1.jar
asm-commons-3.1.jar
asm-tree-3.1.jar
avro-1.7.7.jar
-avro-ipc-1.7.7-tests.jar
avro-ipc-1.7.7.jar
avro-mapred-1.7.7-hadoop2.jar
base64-2.3.8.jar
@@ -61,6 +60,7 @@ geronimo-annotation_1.0_spec-1.1.1.jar
geronimo-jaspic_1.0_spec-1.0.jar
geronimo-jta_1.1_spec-1.1.1.jar
gson-2.2.4.jar
+guava-14.0.1.jar
guice-3.0.jar
guice-servlet-3.0.jar
hadoop-annotations-2.7.0.jar
@@ -163,7 +163,6 @@ scala-parser-combinators_2.11-1.0.4.jar
scala-reflect-2.11.8.jar
scala-xml_2.11-1.0.2.jar
scalap-2.11.8.jar
-servlet-api-2.5.jar
slf4j-api-1.7.16.jar
slf4j-log4j12-1.7.16.jar
snappy-0.2.jar
@@ -176,7 +175,6 @@ stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
univocity-parsers-1.5.6.jar
-unused-1.0.0.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
xmlenc-0.52.jar
diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh
index dbdd42ff9e..4f7544f6ea 100755
--- a/dev/make-distribution.sh
+++ b/dev/make-distribution.sh
@@ -160,28 +160,35 @@ echo -e "\$ ${BUILD_COMMAND[@]}\n"
# Make directories
rm -rf "$DISTDIR"
-mkdir -p "$DISTDIR/lib"
+mkdir -p "$DISTDIR/jars"
echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE"
echo "Build flags: $@" >> "$DISTDIR/RELEASE"
# Copy jars
-cp "$SPARK_HOME"/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/"
-# This will fail if the -Pyarn profile is not provided
-# In this case, silence the error and ignore the return code of this command
-cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/lib/" &> /dev/null || :
+cp "$SPARK_HOME"/assembly/target/scala*/jars/* "$DISTDIR/jars/"
+
+# Only create the yarn directory if the yarn artifacts were build.
+if [ -f "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar ]; then
+ mkdir "$DISTDIR"/yarn
+ cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/yarn"
+fi
# Copy examples and dependencies
mkdir -p "$DISTDIR/examples/jars"
cp "$SPARK_HOME"/examples/target/scala*/jars/* "$DISTDIR/examples/jars"
+# Deduplicate jars that have already been packaged as part of the main Spark dependencies.
+for f in "$DISTDIR/examples/jars/"*; do
+ name=$(basename "$f")
+ if [ -f "$DISTDIR/jars/$name" ]; then
+ rm "$DISTDIR/examples/jars/$name"
+ fi
+done
+
# Copy example sources (needed for python and SQL)
mkdir -p "$DISTDIR/examples/src/main"
cp -r "$SPARK_HOME"/examples/src/main "$DISTDIR/examples/src/"
-if [ "$SPARK_HIVE" == "1" ]; then
- cp "$SPARK_HOME"/lib_managed/jars/datanucleus*.jar "$DISTDIR/lib/"
-fi
-
# Copy license and ASF files
cp "$SPARK_HOME/LICENSE" "$DISTDIR"
cp -r "$SPARK_HOME/licenses" "$DISTDIR"
diff --git a/dev/mima b/dev/mima
index ea746e6f01..c355349045 100755
--- a/dev/mima
+++ b/dev/mima
@@ -25,8 +25,8 @@ FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
cd "$FWDIR"
SPARK_PROFILES="-Pyarn -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive"
-TOOLS_CLASSPATH="$(build/sbt "export tools/fullClasspath" | tail -n1)"
-OLD_DEPS_CLASSPATH="$(build/sbt $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)"
+TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)"
+OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)"
rm -f .generated-mima*
@@ -36,7 +36,7 @@ java \
-cp "$TOOLS_CLASSPATH:$OLD_DEPS_CLASSPATH" \
org.apache.spark.tools.GenerateMIMAIgnore
-echo -e "q\n" | build/sbt mimaReportBinaryIssues | grep -v -e "info.*Resolving"
+echo -e "q\n" | build/sbt -DcopyDependencies=false "$@" mimaReportBinaryIssues | grep -v -e "info.*Resolving"
ret_val=$?
if [ $ret_val != 0 ]; then
diff --git a/dev/run-tests.py b/dev/run-tests.py
index c2944747ee..cbe347274e 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -350,7 +350,7 @@ def build_spark_sbt(hadoop_version):
def build_spark_assembly_sbt(hadoop_version):
# Enable all of the profiles for the build:
build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
- sbt_goals = ["assembly/assembly"]
+ sbt_goals = ["assembly/package"]
profiles_and_goals = build_profiles + sbt_goals
print("[info] Building Spark assembly (w/Hive 1.2.1) using SBT with these arguments: ",
" ".join(profiles_and_goals))
@@ -371,9 +371,10 @@ def build_apache_spark(build_tool, hadoop_version):
build_spark_sbt(hadoop_version)
-def detect_binary_inop_with_mima():
+def detect_binary_inop_with_mima(hadoop_version):
+ build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
- run_cmd([os.path.join(SPARK_HOME, "dev", "mima")])
+ run_cmd([os.path.join(SPARK_HOME, "dev", "mima")] + build_profiles)
def run_scala_tests_maven(test_profiles):
@@ -571,8 +572,8 @@ def main():
# backwards compatibility checks
if build_tool == "sbt":
# Note: compatibility tests only supported in sbt for now
- detect_binary_inop_with_mima()
- # Since we did not build assembly/assembly before running dev/mima, we need to
+ detect_binary_inop_with_mima(hadoop_version)
+ # Since we did not build assembly/package before running dev/mima, we need to
# do it here because the tests still rely on it; see SPARK-13294 for details.
build_spark_assembly_sbt(hadoop_version)
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 2fdc97f8a0..274a8edb0c 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1687,12 +1687,7 @@ on all of the worker nodes, as they will need access to the Hive serialization a
(SerDes) in order to access data stored in Hive.
Configuration of Hive is done by placing your `hive-site.xml`, `core-site.xml` (for security configuration),
- `hdfs-site.xml` (for HDFS configuration) file in `conf/`. Please note when running
-the query on a YARN cluster (`cluster` mode), the `datanucleus` jars under the `lib` directory
-and `hive-site.xml` under `conf/` directory need to be available on the driver and all executors launched by the
-YARN cluster. The convenient way to do this is adding them through the `--jars` option and `--file` option of the
-`spark-submit` command.
-
+`hdfs-site.xml` (for HDFS configuration) file in `conf/`.
<div class="codetabs">
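With the doc change above, only the Hive/Hadoop client configuration files need to be placed in conf/; the datanucleus jars no longer have to be shipped via --jars. A minimal sketch, where the source file locations are assumptions:

    cp /etc/hive/conf/hive-site.xml   "$SPARK_HOME/conf/"
    cp /etc/hadoop/conf/core-site.xml "$SPARK_HOME/conf/"
    cp /etc/hadoop/conf/hdfs-site.xml "$SPARK_HOME/conf/"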
diff --git a/examples/pom.xml b/examples/pom.xml
index b7f37978b9..4a20370f06 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -27,13 +27,16 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-examples_2.11</artifactId>
- <properties>
- <sbt.project.name>examples</sbt.project.name>
- </properties>
<packaging>jar</packaging>
<name>Spark Project Examples</name>
<url>http://spark.apache.org/</url>
+ <properties>
+ <sbt.project.name>examples</sbt.project.name>
+ <build.testJarPhase>none</build.testJarPhase>
+ <build.copyDependenciesPhase>package</build.copyDependenciesPhase>
+ </properties>
+
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -77,23 +80,6 @@
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
- <artifactId>hbase-testing-util</artifactId>
- <version>${hbase.version}</version>
- <scope>${hbase.deps.scope}</scope>
- <exclusions>
- <exclusion>
- <!-- SPARK-4455 -->
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-annotations</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.jruby</groupId>
- <artifactId>jruby-complete</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
<artifactId>hbase-protocol</artifactId>
<version>${hbase.version}</version>
<scope>${hbase.deps.scope}</scope>
@@ -140,6 +126,10 @@
<artifactId>hbase-annotations</artifactId>
</exclusion>
<exclusion>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>hbase-common</artifactId>
+ </exclusion>
+ <exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
</exclusion>
@@ -209,13 +199,6 @@
<scope>${hbase.deps.scope}</scope>
</dependency>
<dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-hadoop-compat</artifactId>
- <version>${hbase.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<scope>provided</scope>
@@ -294,17 +277,6 @@
<artifactId>scopt_${scala.binary.version}</artifactId>
<version>3.3.0</version>
</dependency>
-
- <!--
- The following dependencies are already present in the Spark assembly, so we want to force
- them to be provided.
- -->
- <dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-library</artifactId>
- <scope>provided</scope>
- </dependency>
-
</dependencies>
<build>
@@ -325,38 +297,6 @@
<skip>true</skip>
</configuration>
</plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <id>prepare-test-jar</id>
- <phase>none</phase>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- <configuration>
- <outputDirectory>${jars.target.dir}</outputDirectory>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>copy-dependencies</goal>
- </goals>
- <configuration>
- <includeScope>runtime</includeScope>
- <outputDirectory>${jars.target.dir}</outputDirectory>
- </configuration>
- </execution>
- </executions>
- </plugin>
</plugins>
</build>
<profiles>
diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java
index d02b2a4994..7a5e37c501 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java
@@ -144,10 +144,26 @@ abstract class AbstractCommandBuilder {
boolean isTesting = "1".equals(getenv("SPARK_TESTING"));
if (prependClasses || isTesting) {
String scala = getScalaVersion();
- List<String> projects = Arrays.asList("core", "repl", "mllib", "graphx",
- "streaming", "tools", "sql/catalyst", "sql/core", "sql/hive", "sql/hive-thriftserver",
- "yarn", "launcher",
- "common/network-common", "common/network-shuffle", "common/network-yarn");
+ List<String> projects = Arrays.asList(
+ "common/network-common",
+ "common/network-shuffle",
+ "common/network-yarn",
+ "common/sketch",
+ "common/tags",
+ "common/unsafe",
+ "core",
+ "examples",
+ "graphx",
+ "launcher",
+ "mllib",
+ "repl",
+ "sql/catalyst",
+ "sql/core",
+ "sql/hive",
+ "sql/hive-thriftserver",
+ "streaming",
+ "yarn"
+ );
if (prependClasses) {
if (!isTesting) {
System.err.println(
@@ -174,31 +190,12 @@ abstract class AbstractCommandBuilder {
// Add Spark jars to the classpath. For the testing case, we rely on the test code to set and
// propagate the test classpath appropriately. For normal invocation, look for the jars
// directory under SPARK_HOME.
- String jarsDir = findJarsDir(getSparkHome(), getScalaVersion(), !isTesting);
+ boolean isTestingSql = "1".equals(getenv("SPARK_SQL_TESTING"));
+ String jarsDir = findJarsDir(getSparkHome(), getScalaVersion(), !isTesting && !isTestingSql);
if (jarsDir != null) {
addToClassPath(cp, join(File.separator, jarsDir, "*"));
}
- // Datanucleus jars must be included on the classpath. Datanucleus jars do not work if only
- // included in the uber jar as plugin.xml metadata is lost. Both sbt and maven will populate
- // "lib_managed/jars/" with the datanucleus jars when Spark is built with Hive
- File libdir;
- if (new File(sparkHome, "RELEASE").isFile()) {
- libdir = new File(sparkHome, "lib");
- } else {
- libdir = new File(sparkHome, "lib_managed/jars");
- }
-
- if (libdir.isDirectory()) {
- for (File jar : libdir.listFiles()) {
- if (jar.getName().startsWith("datanucleus-")) {
- addToClassPath(cp, jar.getAbsolutePath());
- }
- }
- } else {
- checkState(isTesting, "Library directory '%s' does not exist.", libdir.getAbsolutePath());
- }
-
addToClassPath(cp, getenv("HADOOP_CONF_DIR"));
addToClassPath(cp, getenv("YARN_CONF_DIR"));
addToClassPath(cp, getenv("SPARK_DIST_CLASSPATH"));
diff --git a/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java b/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java
index a08c8dcba4..91586aad7b 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/CommandBuilderUtils.java
@@ -358,12 +358,12 @@ class CommandBuilderUtils {
// TODO: change to the correct directory once the assembly build is changed.
File libdir;
if (new File(sparkHome, "RELEASE").isFile()) {
- libdir = new File(sparkHome, "lib");
+ libdir = new File(sparkHome, "jars");
checkState(!failIfNotFound || libdir.isDirectory(),
"Library directory '%s' does not exist.",
libdir.getAbsolutePath());
} else {
- libdir = new File(sparkHome, String.format("assembly/target/scala-%s", scalaVersion));
+ libdir = new File(sparkHome, String.format("assembly/target/scala-%s/jars", scalaVersion));
if (!libdir.isDirectory()) {
checkState(!failIfNotFound,
"Library directory '%s' does not exist; make sure Spark is built.",
diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
index 56e4107c5a..c31c42cd3a 100644
--- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
+++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java
@@ -336,6 +336,7 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
}
private List<String> findExamplesJars() {
+ boolean isTesting = "1".equals(getenv("SPARK_TESTING"));
List<String> examplesJars = new ArrayList<>();
String sparkHome = getSparkHome();
@@ -346,11 +347,15 @@ class SparkSubmitCommandBuilder extends AbstractCommandBuilder {
jarsDir = new File(sparkHome,
String.format("examples/target/scala-%s/jars", getScalaVersion()));
}
- checkState(jarsDir.isDirectory(), "Examples jars directory '%s' does not exist.",
+
+ boolean foundDir = jarsDir.isDirectory();
+ checkState(isTesting || foundDir, "Examples jars directory '%s' does not exist.",
jarsDir.getAbsolutePath());
- for (File f: jarsDir.listFiles()) {
- examplesJars.add(f.getAbsolutePath());
+ if (foundDir) {
+ for (File f: jarsDir.listFiles()) {
+ examplesJars.add(f.getAbsolutePath());
+ }
}
return examplesJars;
}
diff --git a/pom.xml b/pom.xml
index e135c92c07..984b2859ef 100644
--- a/pom.xml
+++ b/pom.xml
@@ -185,6 +185,10 @@
<!-- Modules that copy jars to the build directory should do so under this location. -->
<jars.target.dir>${project.build.directory}/scala-${scala.binary.version}/jars</jars.target.dir>
+ <!-- Allow modules to enable / disable certain build plugins easily. -->
+ <build.testJarPhase>prepare-package</build.testJarPhase>
+ <build.copyDependenciesPhase>none</build.copyDependenciesPhase>
+
<!--
Dependency scopes that can be overridden by enabling certain profiles. These profiles are
declared in the projects that build assemblies.
@@ -238,15 +242,6 @@
</pluginRepositories>
<dependencies>
<!--
- This is a dummy dependency that is used along with the shading plug-in
- to create effective poms on publishing (see SPARK-3812).
- -->
- <dependency>
- <groupId>org.spark-project.spark</groupId>
- <artifactId>unused</artifactId>
- <version>1.0.0</version>
- </dependency>
- <!--
This is needed by the scalatest plugin, and so is declared here to be available in
all child modules, just as scalatest is run in all children
-->
@@ -833,6 +828,14 @@
</exclusion>
</exclusions>
</dependency>
+ <!-- avro-mapred for some reason depends on avro-ipc's test jar, so undo that. -->
+ <dependency>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro-ipc</artifactId>
+ <classifier>tests</classifier>
+ <version>${avro.version}</version>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-mapred</artifactId>
@@ -1521,6 +1524,10 @@
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>javax.servlet</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
</exclusions>
</dependency>
@@ -1916,6 +1923,7 @@
-->
<SPARK_DIST_CLASSPATH>${test_classpath}</SPARK_DIST_CLASSPATH>
<SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES>
+ <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION>
<SPARK_TESTING>1</SPARK_TESTING>
<JAVA_HOME>${test.java.home}</JAVA_HOME>
</environmentVariables>
@@ -1964,6 +1972,7 @@
-->
<SPARK_DIST_CLASSPATH>${test_classpath}</SPARK_DIST_CLASSPATH>
<SPARK_PREPEND_CLASSES>1</SPARK_PREPEND_CLASSES>
+ <SPARK_SCALA_VERSION>${scala.binary.version}</SPARK_SCALA_VERSION>
<SPARK_TESTING>1</SPARK_TESTING>
<JAVA_HOME>${test.java.home}</JAVA_HOME>
</environmentVariables>
@@ -2146,6 +2155,7 @@
<version>2.10</version>
<executions>
<execution>
+ <id>generate-test-classpath</id>
<phase>test-compile</phase>
<goals>
<goal>build-classpath</goal>
@@ -2155,6 +2165,17 @@
<outputProperty>test_classpath</outputProperty>
</configuration>
</execution>
+ <execution>
+ <id>copy-module-dependencies</id>
+ <phase>${build.copyDependenciesPhase}</phase>
+ <goals>
+ <goal>copy-dependencies</goal>
+ </goals>
+ <configuration>
+ <includeScope>runtime</includeScope>
+ <outputDirectory>${jars.target.dir}</outputDirectory>
+ </configuration>
+ </execution>
</executions>
</plugin>
@@ -2169,9 +2190,6 @@
<shadedArtifactAttached>false</shadedArtifactAttached>
<artifactSet>
<includes>
- <!-- At a minimum we must include this to force effective pom generation -->
- <include>org.spark-project.spark:unused</include>
-
<include>org.eclipse.jetty:jetty-io</include>
<include>org.eclipse.jetty:jetty-http</include>
<include>org.eclipse.jetty:jetty-continuation</include>
@@ -2302,7 +2320,7 @@
<executions>
<execution>
<id>prepare-test-jar</id>
- <phase>prepare-package</phase>
+ <phase>${build.testJarPhase}</phase>
<goals>
<goal>test-jar</goal>
</goals>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 5d62b688b9..b32480b164 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -57,11 +57,12 @@ object BuildCommons {
Seq("yarn", "java8-tests", "ganglia-lgpl", "streaming-kinesis-asl",
"docker-integration-tests").map(ProjectRef(buildLocation, _))
- val assemblyProjects@Seq(assembly, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKinesisAslAssembly) =
- Seq("assembly", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-kinesis-asl-assembly")
+ val assemblyProjects@Seq(networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingKinesisAslAssembly) =
+ Seq("network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-kinesis-asl-assembly")
.map(ProjectRef(buildLocation, _))
- val copyJarsProjects@Seq(examples) = Seq("examples").map(ProjectRef(buildLocation, _))
+ val copyJarsProjects@Seq(assembly, examples) = Seq("assembly", "examples")
+ .map(ProjectRef(buildLocation, _))
val tools = ProjectRef(buildLocation, "tools")
// Root project.
@@ -263,8 +264,14 @@ object SparkBuild extends PomBuild {
/* Unsafe settings */
enable(Unsafe.settings)(unsafe)
- /* Set up tasks to copy dependencies during packaging. */
- copyJarsProjects.foreach(enable(CopyDependencies.settings))
+ /*
+ * Set up tasks to copy dependencies during packaging. This step can be disabled in the command
+ * line, so that dev/mima can run without trying to copy these files again and potentially
+ * causing issues.
+ */
+ if (!"false".equals(System.getProperty("copyDependencies"))) {
+ copyJarsProjects.foreach(enable(CopyDependencies.settings))
+ }
/* Enable Assembly for all assembly projects */
assemblyProjects.foreach(enable(Assembly.settings))
@@ -477,8 +484,6 @@ object Assembly {
val hadoopVersion = taskKey[String]("The version of hadoop that spark is compiled against.")
- val deployDatanucleusJars = taskKey[Unit]("Deploy datanucleus jars to the spark/lib_managed/jars directory")
-
lazy val settings = assemblySettings ++ Seq(
test in assembly := {},
hadoopVersion := {
@@ -497,27 +502,13 @@ object Assembly {
s"${mName}-test-${v}.jar"
},
mergeStrategy in assembly := {
- case PathList("org", "datanucleus", xs @ _*) => MergeStrategy.discard
case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard
case "log4j.properties" => MergeStrategy.discard
case m if m.toLowerCase.startsWith("meta-inf/services/") => MergeStrategy.filterDistinctLines
case "reference.conf" => MergeStrategy.concat
case _ => MergeStrategy.first
- },
- deployDatanucleusJars := {
- val jars: Seq[File] = (fullClasspath in assembly).value.map(_.data)
- .filter(_.getPath.contains("org.datanucleus"))
- var libManagedJars = new File(BuildCommons.sparkHome, "lib_managed/jars")
- libManagedJars.mkdirs()
- jars.foreach { jar =>
- val dest = new File(libManagedJars, jar.getName)
- if (!dest.exists()) {
- Files.copy(jar.toPath, dest.toPath)
- }
- }
- },
- assembly <<= assembly.dependsOn(deployDatanucleusJars)
+ }
)
}
@@ -698,6 +689,13 @@ object Java8TestSettings {
object TestSettings {
import BuildCommons._
+ private val scalaBinaryVersion =
+ if (System.getProperty("scala-2.10") == "true") {
+ "2.10"
+ } else {
+ "2.11"
+ }
+
lazy val settings = Seq (
// Fork new JVMs for tests and set Java options for those
fork := true,
@@ -707,6 +705,7 @@ object TestSettings {
"SPARK_DIST_CLASSPATH" ->
(fullClasspath in Test).value.files.map(_.getAbsolutePath).mkString(":").stripSuffix(":"),
"SPARK_PREPEND_CLASSES" -> "1",
+ "SPARK_SCALA_VERSION" -> scalaBinaryVersion,
"SPARK_TESTING" -> "1",
"JAVA_HOME" -> sys.env.get("JAVA_HOME").getOrElse(sys.props("java.home"))),
javaOptions in Test += s"-Djava.io.tmpdir=$testTempDir",
@@ -744,7 +743,7 @@ object TestSettings {
// Make sure the test temp directory exists.
resourceGenerators in Test <+= resourceManaged in Test map { outDir: File =>
if (!new File(testTempDir).isDirectory()) {
- require(new File(testTempDir).mkdirs())
+ require(new File(testTempDir).mkdirs(), s"Error creating temp directory $testTempDir.")
}
Seq[File]()
},
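The copyDependencies system property introduced here is what dev/mima uses above to skip re-copying jars on every sbt invocation. A hedged usage example, mirroring the dev/mima hunk:

    # Skip the dependency-copy step when only classpath information is needed:
    ./build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1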
diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py
index d010c0e008..148bf7e8ff 100644
--- a/python/pyspark/streaming/tests.py
+++ b/python/pyspark/streaming/tests.py
@@ -1482,7 +1482,7 @@ def search_kafka_assembly_jar():
raise Exception(
("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) +
"You need to build Spark with "
- "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or "
+ "'build/sbt assembly/package streaming-kafka-assembly/assembly' or "
"'build/mvn package' before running this test.")
elif len(jars) > 1:
raise Exception(("Found multiple Spark Streaming Kafka assembly JARs: %s; please "
@@ -1548,7 +1548,7 @@ if __name__ == "__main__":
elif are_kinesis_tests_enabled is False:
sys.stderr.write("Skipping all Kinesis Python tests as the optional Kinesis project was "
"not compiled into a JAR. To run these tests, "
- "you need to build Spark with 'build/sbt -Pkinesis-asl assembly/assembly "
+ "you need to build Spark with 'build/sbt -Pkinesis-asl assembly/package "
"streaming-kinesis-asl-assembly/assembly' or "
"'build/mvn -Pkinesis-asl package' before running this test.")
else:
@@ -1556,7 +1556,7 @@ if __name__ == "__main__":
("Failed to find Spark Streaming Kinesis assembly jar in %s. "
% kinesis_asl_assembly_dir) +
"You need to build Spark with 'build/sbt -Pkinesis-asl "
- "assembly/assembly streaming-kinesis-asl-assembly/assembly'"
+ "assembly/package streaming-kinesis-asl-assembly/assembly'"
"or 'build/mvn -Pkinesis-asl package' before running this test.")
sys.stderr.write("Running tests: %s \n" % (str(testcases)))
diff --git a/python/run-tests.py b/python/run-tests.py
index a9f8854e6f..38b3bb84c1 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -53,11 +53,25 @@ LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log")
FAILURE_REPORTING_LOCK = Lock()
LOGGER = logging.getLogger()
+# Find out where the assembly jars are located.
+for scala in ["2.11", "2.10"]:
+ build_dir = os.path.join(SPARK_HOME, "assembly", "target", "scala-" + scala)
+ if os.path.isdir(build_dir):
+ SPARK_DIST_CLASSPATH = os.path.join(build_dir, "jars", "*")
+ break
+else:
+ raise Exception("Cannot find assembly build directory, please build Spark first.")
+
def run_individual_python_test(test_name, pyspark_python):
env = dict(os.environ)
- env.update({'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python),
- 'PYSPARK_DRIVER_PYTHON': which(pyspark_python)})
+ env.update({
+ 'SPARK_DIST_CLASSPATH': SPARK_DIST_CLASSPATH,
+ 'SPARK_TESTING': '1',
+ 'SPARK_PREPEND_CLASSES': '1',
+ 'PYSPARK_PYTHON': which(pyspark_python),
+ 'PYSPARK_DRIVER_PYTHON': which(pyspark_python)
+ })
LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
start_time = time.time()
try:
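Since python/run-tests.py now locates the copied jars itself and exports SPARK_DIST_CLASSPATH, the Python tests require the assembly jars directory to exist before they run. A rough sequence, with the profile flag as an illustrative assumption:

    ./build/sbt -Phive assembly/package
    ./python/run-tests.py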
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
index 33af624cfd..2c7358e59a 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala
@@ -763,11 +763,15 @@ abstract class HiveThriftServer2Test extends SparkFunSuite with BeforeAndAfterAl
extraEnvironment = Map(
// Disables SPARK_TESTING to exclude log4j.properties in test directories.
"SPARK_TESTING" -> "0",
+ // But set SPARK_SQL_TESTING to make spark-class happy.
+ "SPARK_SQL_TESTING" -> "1",
// Points SPARK_PID_DIR to SPARK_HOME, otherwise only 1 Thrift server instance can be
// started at a time, which is not Jenkins friendly.
"SPARK_PID_DIR" -> pidDir.getCanonicalPath),
redirectStderr = true)
+ logInfo(s"COMMAND: $command")
+ logInfo(s"OUTPUT: $lines")
lines.split("\n").collectFirst {
case line if line.contains(LOG_FILE_MARK) => new File(line.drop(LOG_FILE_MARK.length))
}.getOrElse {
diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml
index 58efd80512..61504becf1 100644
--- a/sql/hive/pom.xml
+++ b/sql/hive/pom.xml
@@ -225,30 +225,6 @@
<argLine>-da -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m</argLine>
</configuration>
</plugin>
-
- <!-- Deploy datanucleus jars to the spark/lib_managed/jars directory -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <executions>
- <execution>
- <id>copy-dependencies</id>
- <phase>package</phase>
- <goals>
- <goal>copy-dependencies</goal>
- </goals>
- <configuration>
- <!-- basedir is spark/sql/hive/ -->
- <outputDirectory>${basedir}/../../lib_managed/jars</outputDirectory>
- <overWriteReleases>false</overWriteReleases>
- <overWriteSnapshots>false</overWriteSnapshots>
- <overWriteIfNewer>true</overWriteIfNewer>
- <includeGroupIds>org.datanucleus</includeGroupIds>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
</plugins>
</build>
</project>
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index 4dd3ccdf37..336e29fc6b 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -447,9 +447,6 @@ private[spark] class Client(
*
* Note that the archive cannot be a "local" URI. If none of the above settings are found,
* then upload all files found in $SPARK_HOME/jars.
- *
- * TODO: currently the code looks in $SPARK_HOME/lib while the work to replace assemblies
- * with a directory full of jars is ongoing.
*/
val sparkArchive = sparkConf.get(SPARK_ARCHIVE)
if (sparkArchive.isDefined) {
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
index 2eaafa072a..74e268dc48 100644
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
+++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
@@ -273,7 +273,7 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll
test("distribute local spark jars") {
val temp = Utils.createTempDir()
- val jarsDir = new File(temp, "lib")
+ val jarsDir = new File(temp, "jars")
assert(jarsDir.mkdir())
val jar = TestUtils.createJarWithFiles(Map(), jarsDir)
new FileOutputStream(new File(temp, "RELEASE")).close()