author     Marcelo Vanzin <vanzin@cloudera.com>  2015-04-14 13:41:38 -0700
committer  Andrew Or <andrew@databricks.com>     2015-04-14 13:41:38 -0700
commit     65774370a1275e25cd8a3357e397d116767793a9 (patch)
tree       741ac9a7e131bd3ab69c98b1c7b5735df0dabc22 /project
parent     6adb8bcbf0a1a7bfe2990de18c59c66cd7a0aeb8 (diff)
[SPARK-5808] [build] Package pyspark files in sbt assembly.
This turned out to be more complicated than I wanted because the
layout of python/ doesn't really follow the usual maven conventions.
So some extra code is needed to copy just the right things.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #5461 from vanzin/SPARK-5808 and squashes the following commits:

7153dac [Marcelo Vanzin] Only try to create resource dir if it doesn't already exist.
ee90e84 [Marcelo Vanzin] [SPARK-5808] [build] Package pyspark files in sbt assembly.
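The mechanism the patch relies on is sbt's resourceGenerators hook: a task registered there writes files under resourceManaged and returns the files it produced, and sbt then packages them like any other resource. A minimal sketch of the pattern, in the sbt 0.13-era syntax this build uses (the file name and contents below are illustrative, not part of the patch):

    // Sketch of a resource generator: write one file into the managed
    // resource directory and return it so sbt packages it with the rest.
    resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
      val file = new File(outDir, "example/generated.txt")  // illustrative path
      file.getParentFile().mkdirs()
      sbt.IO.write(file, "generated at build time")
      Seq(file)
    }

The patch uses the same hook, but recursively copies the .py sources from python/pyspark instead of writing a single file.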
Diffstat (limited to 'project')
-rw-r--r--  project/SparkBuild.scala  60
1 file changed, 59 insertions(+), 1 deletion(-)
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 5f51f4b58f..09b4976d10 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-import java.io.File
+import java.io._
 
 import scala.util.Properties
 import scala.collection.JavaConversions._
@@ -166,6 +166,9 @@ object SparkBuild extends PomBuild {
   /* Enable Assembly for all assembly projects */
   assemblyProjects.foreach(enable(Assembly.settings))
 
+  /* Package pyspark artifacts in the main assembly. */
+  enable(PySparkAssembly.settings)(assembly)
+
   /* Enable unidoc only for the root spark project */
   enable(Unidoc.settings)(spark)
 
@@ -316,6 +319,7 @@ object Hive {
 }
 
 object Assembly {
+  import sbtassembly.AssemblyUtils._
   import sbtassembly.Plugin._
   import AssemblyKeys._
 
@@ -347,6 +351,60 @@ object Assembly {
   )
 }
 
+object PySparkAssembly {
+  import sbtassembly.Plugin._
+  import AssemblyKeys._
+
+  lazy val settings = Seq(
+    unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" },
+    // Use a resource generator to copy all .py files from python/pyspark into a managed directory
+    // to be included in the assembly. We can't just add "python/" to the assembly's resource dir
+    // list since that will copy unneeded / unwanted files.
+    resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
+      val dst = new File(outDir, "pyspark")
+      if (!dst.isDirectory()) {
+        require(dst.mkdirs())
+      }
+
+      val src = new File(BuildCommons.sparkHome, "python/pyspark")
+      copy(src, dst)
+    }
+  )
+
+  private def copy(src: File, dst: File): Seq[File] = {
+    src.listFiles().flatMap { f =>
+      val child = new File(dst, f.getName())
+      if (f.isDirectory()) {
+        child.mkdir()
+        copy(f, child)
+      } else if (f.getName().endsWith(".py")) {
+        var in: Option[FileInputStream] = None
+        var out: Option[FileOutputStream] = None
+        try {
+          in = Some(new FileInputStream(f))
+          out = Some(new FileOutputStream(child))
+
+          val bytes = new Array[Byte](1024)
+          var read = 0
+          while (read >= 0) {
+            read = in.get.read(bytes)
+            if (read > 0) {
+              out.get.write(bytes, 0, read)
+            }
+          }
+
+          Some(child)
+        } finally {
+          in.foreach(_.close())
+          out.foreach(_.close())
+        }
+      } else {
+        None
+      }
+    }
+  }
+}
+
 object Unidoc {
   import BuildCommons._
 
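After sbt assembly, the copied sources should end up in the jar under the pyspark/ prefix, since resourceManaged acts as a resource root and only the pyspark subdirectory name survives into the jar. An illustrative way to spot-check the result, assuming the assembly jar is on the classpath (the object name here is made up for the example):

    // Spot-check: the packaged .py files should resolve as classpath resources.
    object VerifyPySparkResources {
      def main(args: Array[String]): Unit = {
        val in = getClass.getResourceAsStream("/pyspark/__init__.py")
        require(in != null, "pyspark/__init__.py not found on the classpath")
        in.close()
        println("pyspark sources are packaged in the assembly")
      }
    }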