SPARK-1314: Use SPARK_HIVE to determine if we include Hive in packaging

Previously, we based our decision on whether to include datanucleus jars on the existence of a spark-hive-assembly jar, which was incidentally built whenever "sbt assembly" was run. This means that a typical and previously supported pathway would start using hive jars. This patch has the following features/bug fixes: - Use of SPARK_HIVE (default false) to determine if we should include Hive in the assembly jar. - Analogous feature in Maven with -Phive (previously, there was no support for adding Hive to any of our jars produced by Maven) - assemble-deps fixed since we no longer use a different ASSEMBLY_DIR - avoid adding log message in compute-classpath.sh to the classpath :) Still TODO before mergeable: - We need to download the datanucleus jars outside of sbt. Perhaps we can have spark-class download them if SPARK_HIVE is set, similar to how sbt downloads itself. - Spark SQL documentation updates. Author: Aaron Davidson <aaron@databricks.com> Closes #237 from aarondav/master and squashes the following commits: 5dc4329 [Aaron Davidson] Typo fixes dd4f298 [Aaron Davidson] Doc update dd1a365 [Aaron Davidson] Eliminate need for SPARK_HIVE at runtime by d/ling datanucleus from Maven a9269b5 [Aaron Davidson] [WIP] Use SPARK_HIVE to determine if we include Hive in packaging
author: Aaron Davidson <aaron@databricks.com> 2014-04-06 17:48:41 -0700
committer: Patrick Wendell <pwendell@gmail.com> 2014-04-06 17:48:41 -0700
commit: 4106558435889261243d186f5f0b51c5f9e98d56 (patch)
tree: 6735046be9dbc5048867a619a951c39d884f3d1f /project/SparkBuild.scala
parent: 7ce52c4a7a07b0db5e7c1312b1920efb1165ce6a (diff)
download: spark-4106558435889261243d186f5f0b51c5f9e98d56.tar.gz
spark-4106558435889261243d186f5f0b51c5f9e98d56.tar.bz2
spark-4106558435889261243d186f5f0b51c5f9e98d56.zip
1 files changed, 16 insertions, 9 deletions
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 3489b43d43..d1e4b8b964 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -43,6 +43,8 @@ object SparkBuild extends Build {
 
   val DEFAULT_YARN = false
 
+  val DEFAULT_HIVE = false
+
   // HBase version; set as appropriate.
   val HBASE_VERSION = "0.94.6"
 
@@ -67,15 +69,17 @@ object SparkBuild extends Build {
 
   lazy val sql = Project("sql", file("sql/core"), settings = sqlCoreSettings) dependsOn(core, catalyst)
 
-  // Since hive is its own assembly, it depends on all of the modules.
-  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql, graphx, bagel, mllib, streaming, repl)
+  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql)
+
+  lazy val maybeHive: Seq[ClasspathDependency] = if (isHiveEnabled) Seq(hive) else Seq()
+  lazy val maybeHiveRef: Seq[ProjectReference] = if (isHiveEnabled) Seq(hive) else Seq()
 
   lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn(core)
 
   lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn(core)
 
   lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings)
-    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeGanglia: _*)
+    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*)
 
   lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects")
 
@@ -101,6 +105,11 @@ object SparkBuild extends Build {
   lazy val hadoopClient = if (hadoopVersion.startsWith("0.20.") || hadoopVersion == "1.0.0") "hadoop-core" else "hadoop-client"
   val maybeAvro = if (hadoopVersion.startsWith("0.23.") && isYarnEnabled) Seq("org.apache.avro" % "avro" % "1.7.4") else Seq()
 
+  lazy val isHiveEnabled = Properties.envOrNone("SPARK_HIVE") match {
+    case None => DEFAULT_HIVE
+    case Some(v) => v.toBoolean
+  }
+
   // Include Ganglia integration if the user has enabled Ganglia
   // This is isolated from the normal build due to LGPL-licensed code in the library
   lazy val isGangliaEnabled = Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined
@@ -141,13 +150,13 @@ object SparkBuild extends Build {
   lazy val allExternalRefs = Seq[ProjectReference](externalTwitter, externalKafka, externalFlume, externalZeromq, externalMqtt)
 
   lazy val examples = Project("examples", file("examples"), settings = examplesSettings)
-    .dependsOn(core, mllib, graphx, bagel, streaming, externalTwitter, hive) dependsOn(allExternal: _*)
+    .dependsOn(core, mllib, graphx, bagel, streaming, hive) dependsOn(allExternal: _*)
 
   // Everything except assembly, hive, tools, java8Tests and examples belong to packageProjects
-  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeGangliaRef
+  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeHiveRef ++ maybeGangliaRef
 
   lazy val allProjects = packageProjects ++ allExternalRefs ++
-    Seq[ProjectReference](examples, tools, assemblyProj, hive) ++ maybeJava8Tests
+    Seq[ProjectReference](examples, tools, assemblyProj) ++ maybeJava8Tests
 
   def sharedSettings = Defaults.defaultSettings ++ MimaBuild.mimaSettings(file(sparkHome)) ++ Seq(
     organization       := "org.apache.spark",
@@ -417,10 +426,8 @@ object SparkBuild extends Build {
 
   // Since we don't include hive in the main assembly this project also acts as an alternative
   // assembly jar.
-  def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq(
+  def hiveSettings = sharedSettings ++ Seq(
     name := "spark-hive",
-    jarName in assembly <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" },
-    jarName in packageDependency <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" },
     javaOptions += "-XX:MaxPermSize=1g",
     libraryDependencies ++= Seq(
       "org.apache.hive" % "hive-metastore" % hiveVersion,
author	Aaron Davidson <aaron@databricks.com>	2014-04-06 17:48:41 -0700
committer	Patrick Wendell <pwendell@gmail.com>	2014-04-06 17:48:41 -0700
commit	4106558435889261243d186f5f0b51c5f9e98d56 (patch)
tree	6735046be9dbc5048867a619a951c39d884f3d1f /project/SparkBuild.scala
parent	7ce52c4a7a07b0db5e7c1312b1920efb1165ce6a (diff)
download	spark-4106558435889261243d186f5f0b51c5f9e98d56.tar.gz spark-4106558435889261243d186f5f0b51c5f9e98d56.tar.bz2 spark-4106558435889261243d186f5f0b51c5f9e98d56.zip