-rw-r--r--  .gitignore | 1
-rw-r--r--  README.md | 18
-rwxr-xr-x  bin/pyspark | 2
-rwxr-xr-x  bin/run-example | 2
-rwxr-xr-x  bin/spark-class | 2
-rw-r--r--  core/src/main/scala/org/apache/spark/SparkConf.scala | 8
-rw-r--r--  core/src/main/scala/org/apache/spark/SparkContext.scala | 4
-rw-r--r--  core/src/main/scala/org/apache/spark/util/AkkaUtils.scala | 8
-rw-r--r--  core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala | 2
-rw-r--r--  docs/README.md | 4
-rw-r--r--  docs/_plugins/copy_api_dirs.rb | 4
-rw-r--r--  docs/api.md | 2
-rw-r--r--  docs/configuration.md | 18
-rw-r--r--  docs/hadoop-third-party-distributions.md | 2
-rw-r--r--  docs/index.md | 6
-rw-r--r--  docs/python-programming-guide.md | 2
-rw-r--r--  docs/quick-start.md | 8
-rw-r--r--  docs/running-on-yarn.md | 6
-rw-r--r--  docs/scala-programming-guide.md | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewGenerator.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewStream.scala | 4
-rwxr-xr-x  make-distribution.sh | 9
-rw-r--r--  pom.xml | 3
-rw-r--r--  project/SparkBuild.scala | 10
-rw-r--r--  project/build.properties | 1
-rwxr-xr-x  sbin/stop-slaves.sh | 4
-rwxr-xr-x  sbt/sbt | 43
33 files changed, 133 insertions, 62 deletions
diff --git a/.gitignore b/.gitignore
index 399362f7d3..39635d7eef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
*.iml
*.iws
.idea/
+sbt/*.jar
.settings
.cache
/build/
diff --git a/README.md b/README.md
index 6daa4633ae..b91e4cf867 100644
--- a/README.md
+++ b/README.md
@@ -13,9 +13,11 @@ This README file only contains basic setup instructions.
## Building
Spark requires Scala 2.10. The project is built using Simple Build Tool (SBT),
-which can be obtained [here](http://www.scala-sbt.org). To build Spark and its example programs, run:
+which can be obtained [here](http://www.scala-sbt.org). If SBT is installed, we
+will use the system version of sbt; otherwise we will attempt to download it
+automatically. To build Spark and its example programs, run:
- sbt assembly
+ ./sbt/sbt assembly
Once you've built Spark, the easiest way to start using it is the shell:
@@ -41,7 +43,7 @@ locally with one thread, or "local[N]" to run locally with N threads.
Testing first requires [Building](#Building) Spark. Once Spark is built, tests
can be run using:
-`sbt test`
+`./sbt/sbt test`
## A Note About Hadoop Versions
@@ -55,22 +57,22 @@ For Apache Hadoop versions 1.x, Cloudera CDH MRv1, and other Hadoop
versions without YARN, use:
# Apache Hadoop 1.2.1
- $ SPARK_HADOOP_VERSION=1.2.1 sbt assembly
+ $ SPARK_HADOOP_VERSION=1.2.1 sbt/sbt assembly
# Cloudera CDH 4.2.0 with MapReduce v1
- $ SPARK_HADOOP_VERSION=2.0.0-mr1-cdh4.2.0 sbt assembly
+ $ SPARK_HADOOP_VERSION=2.0.0-mr1-cdh4.2.0 sbt/sbt assembly
For Apache Hadoop 2.2.X, 2.1.X, 2.0.X, 0.23.x, Cloudera CDH MRv2, and other Hadoop versions
with YARN, also set `SPARK_YARN=true`:
# Apache Hadoop 2.0.5-alpha
- $ SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt assembly
+ $ SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt/sbt assembly
# Cloudera CDH 4.2.0 with MapReduce v2
- $ SPARK_HADOOP_VERSION=2.0.0-cdh4.2.0 SPARK_YARN=true sbt assembly
+ $ SPARK_HADOOP_VERSION=2.0.0-cdh4.2.0 SPARK_YARN=true sbt/sbt assembly
# Apache Hadoop 2.2.X and newer
- $ SPARK_HADOOP_VERSION=2.2.0 SPARK_YARN=true sbt assembly
+ $ SPARK_HADOOP_VERSION=2.2.0 SPARK_YARN=true sbt/sbt assembly
When developing a Spark application, specify the Hadoop version by adding the
"hadoop-client" artifact to your project's dependencies. For example, if you're
diff --git a/bin/pyspark b/bin/pyspark
index f97dfa7e2f..d6810f4686 100755
--- a/bin/pyspark
+++ b/bin/pyspark
@@ -31,7 +31,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar >& /dev/null
if [[ $? != 0 ]]; then
echo "Failed to find Spark assembly in $FWDIR/assembly/target" >&2
- echo "You need to build Spark with sbt assembly before running this program" >&2
+ echo "You need to build Spark with sbt/sbt assembly before running this program" >&2
exit 1
fi
fi
diff --git a/bin/run-example b/bin/run-example
index dfb4bf7baf..6c5d4a6a8f 100755
--- a/bin/run-example
+++ b/bin/run-example
@@ -55,7 +55,7 @@ if [ -e "$EXAMPLES_DIR"/target/spark-examples*[0-9Tg].jar ]; then
fi
if [[ -z $SPARK_EXAMPLES_JAR ]]; then
echo "Failed to find Spark examples assembly in $FWDIR/examples/target" >&2
- echo "You need to build Spark with sbt assembly before running this program" >&2
+ echo "You need to build Spark with sbt/sbt assembly before running this program" >&2
exit 1
fi
diff --git a/bin/spark-class b/bin/spark-class
index 49b0bef0bd..c4225a392d 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -104,7 +104,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar")
if [ "$num_jars" -eq "0" ]; then
echo "Failed to find Spark assembly in $FWDIR/assembly/target/scala-$SCALA_VERSION/" >&2
- echo "You need to build Spark with 'sbt assembly' before running this program." >&2
+ echo "You need to build Spark with 'sbt/sbt assembly' before running this program." >&2
exit 1
fi
if [ "$num_jars" -gt "1" ]; then
diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala
index 98343e9532..b166527614 100644
--- a/core/src/main/scala/org/apache/spark/SparkConf.scala
+++ b/core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -24,7 +24,7 @@ import com.typesafe.config.ConfigFactory
*
* @param loadDefaults whether to load values from the system properties and classpath
*/
-class SparkConf(loadDefaults: Boolean) extends Serializable with Cloneable {
+class SparkConf(loadDefaults: Boolean) extends Serializable with Cloneable with Logging {
/** Create a SparkConf that loads defaults from system properties and the classpath */
def this() = this(true)
@@ -67,7 +67,8 @@ class SparkConf(loadDefaults: Boolean) extends Serializable with Cloneable {
/** Set JAR files to distribute to the cluster. */
def setJars(jars: Seq[String]): SparkConf = {
- set("spark.jars", jars.mkString(","))
+ for (jar <- jars if (jar == null)) logWarning("null jar passed to SparkContext constructor")
+ set("spark.jars", jars.filter(_ != null).mkString(","))
}
/** Set JAR files to distribute to the cluster. (Java-friendly version.) */
@@ -171,6 +172,9 @@ class SparkConf(loadDefaults: Boolean) extends Serializable with Cloneable {
.map{case (k, v) => (k.substring(prefix.length), v)}
}
+ /** Get all akka conf variables set on this SparkConf */
+ def getAkkaConf: Seq[(String, String)] = getAll.filter {case (k, v) => k.startsWith("akka.")}
+
/** Does the configuration contain a given parameter? */
def contains(key: String): Boolean = settings.contains(key)
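
As context for the SparkConf changes above, a minimal application-side sketch (not part of the commit; the jar path, app name, and akka key are illustrative assumptions):

    import org.apache.spark.SparkConf

    // Hypothetical usage of the new behavior.
    val conf = new SparkConf()
      .setAppName("demo")                      // illustrative app name
      .setJars(Seq("target/app.jar", null))    // null entries now log a warning and are filtered out
      .set("akka.loglevel", "DEBUG")           // any "akka.*" key is exposed via getAkkaConf

    // Returns only the akka.* entries, e.g. Seq(("akka.loglevel", "DEBUG")).
    val akkaSettings: Seq[(String, String)] = conf.getAkkaConf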
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index e80e43af6d..99dcced7d7 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -116,6 +116,10 @@ class SparkContext(
throw new SparkException("An application must be set in your configuration")
}
+ if (conf.get("spark.log-conf", "false").toBoolean) {
+ logInfo("Spark configuration:\n" + conf.toDebugString)
+ }
+
// Set Spark driver host and port system properties
conf.setIfMissing("spark.driver.host", Utils.localHostName())
conf.setIfMissing("spark.driver.port", "0")
diff --git a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
index 7df7e3d8e5..2ee37815de 100644
--- a/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/AkkaUtils.scala
@@ -17,12 +17,13 @@
package org.apache.spark.util
+import scala.collection.JavaConversions.mapAsJavaMap
import scala.concurrent.duration.{Duration, FiniteDuration}
import akka.actor.{ActorSystem, ExtendedActorSystem, IndestructibleActorSystem}
import com.typesafe.config.ConfigFactory
-import org.apache.log4j.{Level, Logger}
+import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
/**
@@ -64,7 +65,8 @@ private[spark] object AkkaUtils {
conf.get("spark.akka.failure-detector.threshold", "300.0").toDouble
val akkaHeartBeatInterval = conf.get("spark.akka.heartbeat.interval", "1000").toInt
- val akkaConf = ConfigFactory.parseString(
+ val akkaConf = ConfigFactory.parseMap(conf.getAkkaConf.toMap[String, String]).withFallback(
+ ConfigFactory.parseString(
s"""
|akka.daemonic = on
|akka.loggers = [""akka.event.slf4j.Slf4jLogger""]
@@ -86,7 +88,7 @@ private[spark] object AkkaUtils {
|akka.remote.log-remote-lifecycle-events = $lifecycleEvents
|akka.log-dead-letters = $lifecycleEvents
|akka.log-dead-letters-during-shutdown = $lifecycleEvents
- """.stripMargin)
+ """.stripMargin))
val actorSystem = if (indestructible) {
IndestructibleActorSystem(name, akkaConf)
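
The merge order above follows standard Typesafe Config semantics: keys parsed from the user-supplied map take precedence, and withFallback only fills in keys that are still missing. A self-contained sketch of that behavior, independent of Spark (key names and values are illustrative):

    import scala.collection.JavaConversions.mapAsJavaMap
    import com.typesafe.config.ConfigFactory

    // User-provided akka.* settings, as they would come from SparkConf.getAkkaConf.
    val userConf = Map("akka.loglevel" -> "DEBUG")

    // The generated string config only supplies values for keys the user did not set.
    val merged = ConfigFactory.parseMap(userConf).withFallback(
      ConfigFactory.parseString("""
        |akka.loglevel = "INFO"
        |akka.daemonic = on
        """.stripMargin))

    merged.getString("akka.loglevel")   // "DEBUG" -- the user-supplied value wins
    merged.getBoolean("akka.daemonic")  // true    -- filled in from the fallback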
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
index f58b1ee05a..7e5aaa3f98 100644
--- a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
@@ -26,7 +26,7 @@ import org.apache.spark.deploy.{ExecutorState, Command, ApplicationDescription}
class ExecutorRunnerTest extends FunSuite {
test("command includes appId") {
def f(s:String) = new File(s)
- val sparkHome = sys.env.get("SPARK_HOME").orElse(sys.env.get("spark.home")).get
+ val sparkHome = sys.env.get("SPARK_HOME").orElse(sys.props.get("spark.home")).get
val appDesc = new ApplicationDescription("app name", 8, 500, Command("foo", Seq(),Map()),
sparkHome, "appUiUrl")
val appId = "12345-worker321-9876"
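
The one-character change above matters because sys.env and sys.props read from different sources; a brief reminder sketch (the fallback mirrors the test, the values are whatever the environment provides):

    // sys.env reads OS environment variables, e.g. SPARK_HOME exported by the shell.
    val fromEnv: Option[String] = sys.env.get("SPARK_HOME")

    // sys.props reads JVM system properties, e.g. set with -Dspark.home=... on the command line.
    val fromProps: Option[String] = sys.props.get("spark.home")

    // The test now falls back from the environment variable to the system property.
    val sparkHome: Option[String] = fromEnv.orElse(fromProps)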
diff --git a/docs/README.md b/docs/README.md
index e3d6c9a5bc..dfcf753553 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -27,10 +27,10 @@ To mark a block of code in your markdown to be syntax highlighted by jekyll duri
## API Docs (Scaladoc and Epydoc)
-You can build just the Spark scaladoc by running `sbt doc` from the SPARK_PROJECT_ROOT directory.
+You can build just the Spark scaladoc by running `sbt/sbt doc` from the SPARK_PROJECT_ROOT directory.
Similarly, you can build just the PySpark epydoc by running `epydoc --config epydoc.conf` from the SPARK_PROJECT_ROOT/pyspark directory.
-When you run `jekyll` in the docs directory, it will also copy over the scaladoc for the various Spark subprojects into the docs directory (and then also into the _site directory). We use a jekyll plugin to run `sbt doc` before building the site so if you haven't run it (recently) it may take some time as it generates all of the scaladoc. The jekyll plugin also generates the PySpark docs using [epydoc](http://epydoc.sourceforge.net/).
+When you run `jekyll` in the docs directory, it will also copy over the scaladoc for the various Spark subprojects into the docs directory (and then also into the _site directory). We use a jekyll plugin to run `sbt/sbt doc` before building the site so if you haven't run it (recently) it may take some time as it generates all of the scaladoc. The jekyll plugin also generates the PySpark docs using [epydoc](http://epydoc.sourceforge.net/).
NOTE: To skip the step of building and copying over the Scala and Python API docs, run `SKIP_API=1 jekyll`.
diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb
index ef9912c808..431de909cb 100644
--- a/docs/_plugins/copy_api_dirs.rb
+++ b/docs/_plugins/copy_api_dirs.rb
@@ -26,8 +26,8 @@ if not (ENV['SKIP_API'] == '1' or ENV['SKIP_SCALADOC'] == '1')
curr_dir = pwd
cd("..")
- puts "Running sbt doc from " + pwd + "; this may take a few minutes..."
- puts `sbt doc`
+ puts "Running sbt/sbt doc from " + pwd + "; this may take a few minutes..."
+ puts `sbt/sbt doc`
puts "Moving back into docs dir."
cd("docs")
diff --git a/docs/api.md b/docs/api.md
index 11e2c15324..e86d07770a 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -3,7 +3,7 @@ layout: global
title: Spark API documentation (Scaladoc)
---
-Here you can find links to the Scaladoc generated for the Spark sbt subprojects. If the following links don't work, try running `sbt doc` from the Spark project home directory.
+Here you can find links to the Scaladoc generated for the Spark sbt subprojects. If the following links don't work, try running `sbt/sbt doc` from the Spark project home directory.
- [Spark](api/core/index.html)
- [Spark Examples](api/examples/index.html)
diff --git a/docs/configuration.md b/docs/configuration.md
index 567aba07f0..1d36ecb9c1 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -81,7 +81,8 @@ there are at least five properties that you will commonly want to control:
<td>
When running on a <a href="spark-standalone.html">standalone deploy cluster</a> or a
<a href="running-on-mesos.html#mesos-run-modes">Mesos cluster in "coarse-grained"
- sharing mode</a>, how many CPU cores to request at most. The default will use all available cores
+  sharing mode</a>, the maximum number of CPU cores to request for the application from
+ across the cluster (not from each machine). The default will use all available cores
offered by the cluster manager.
</td>
</tr>
@@ -360,6 +361,14 @@ Apart from these, the following properties are also available, and may be useful
</td>
</tr>
<tr>
+ <td>akka.x.y....</td>
+ <td>value</td>
+ <td>
+    Arbitrary akka configuration options can be set directly on the SparkConf; they are applied to all ActorSystems that Spark creates for that SparkContext and its executors.
+ </td>
+</tr>
+
+<tr>
<td>spark.shuffle.consolidateFiles</td>
<td>false</td>
<td>
@@ -394,6 +403,13 @@ Apart from these, the following properties are also available, and may be useful
How many times slower a task is than the median to be considered for speculation.
</td>
</tr>
+<tr>
+ <td>spark.log-conf</td>
+ <td>false</td>
+ <td>
+    Whether to log the supplied SparkConf as INFO when a SparkContext is started.
+ </td>
+</tr>
</table>
## Viewing Spark Properties
diff --git a/docs/hadoop-third-party-distributions.md b/docs/hadoop-third-party-distributions.md
index 141d475ba6..de6a2b0a43 100644
--- a/docs/hadoop-third-party-distributions.md
+++ b/docs/hadoop-third-party-distributions.md
@@ -12,7 +12,7 @@ with these distributions:
When compiling Spark, you'll need to
[set the SPARK_HADOOP_VERSION flag](index.html#a-note-about-hadoop-versions):
- SPARK_HADOOP_VERSION=1.0.4 sbt assembly
+ SPARK_HADOOP_VERSION=1.0.4 sbt/sbt assembly
The table below lists the corresponding `SPARK_HADOOP_VERSION` code for each CDH/HDP release. Note that
some Hadoop releases are binary compatible across client versions. This means the pre-built Spark
diff --git a/docs/index.md b/docs/index.md
index bf8d1c3375..86d574daaa 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -17,7 +17,7 @@ Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS). All you n
Spark uses [Simple Build Tool](http://www.scala-sbt.org), which is bundled with it. To compile the code, go into the top-level Spark directory and run
- sbt assembly
+ sbt/sbt assembly
For its Scala API, Spark {{site.SPARK_VERSION}} depends on Scala {{site.SCALA_VERSION}}. If you write applications in Scala, you will need to use this same version of Scala in your own program -- newer major versions may not work. You can get the right version of Scala from [scala-lang.org](http://www.scala-lang.org/download/).
@@ -56,12 +56,12 @@ Hadoop, you must build Spark against the same version that your cluster uses.
By default, Spark links to Hadoop 1.0.4. You can change this by setting the
`SPARK_HADOOP_VERSION` variable when compiling:
- SPARK_HADOOP_VERSION=2.2.0 sbt assembly
+ SPARK_HADOOP_VERSION=2.2.0 sbt/sbt assembly
In addition, if you wish to run Spark on [YARN](running-on-yarn.html), set
`SPARK_YARN` to `true`:
- SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt assembly
+ SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt/sbt assembly
Note that on Windows, you need to set the environment variables on separate lines, e.g., `set SPARK_HADOOP_VERSION=1.2.1`.
diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md
index 5d48cb676a..dc187b3efe 100644
--- a/docs/python-programming-guide.md
+++ b/docs/python-programming-guide.md
@@ -69,7 +69,7 @@ The script automatically adds the `bin/pyspark` package to the `PYTHONPATH`.
The `bin/pyspark` script launches a Python interpreter that is configured to run PySpark applications. To use `pyspark` interactively, first build Spark, then launch it directly from the command line without any options:
{% highlight bash %}
-$ sbt assembly
+$ sbt/sbt assembly
$ ./bin/pyspark
{% endhighlight %}
diff --git a/docs/quick-start.md b/docs/quick-start.md
index 9b9261cfff..153081bdaa 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -12,7 +12,7 @@ See the [programming guide](scala-programming-guide.html) for a more complete re
To follow along with this guide, you only need to have successfully built Spark on one machine. Simply go into your Spark directory and run:
{% highlight bash %}
-$ sbt assembly
+$ sbt/sbt assembly
{% endhighlight %}
# Interactive Analysis with the Spark Shell
@@ -146,7 +146,7 @@ If you also wish to read data from Hadoop's HDFS, you will also need to add a de
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "<your-hdfs-version>"
{% endhighlight %}
-Finally, for sbt to work correctly, we'll need to layout `SimpleApp.scala` and `simple.sbt` according to the typical directory structure. Once that is in place, we can create a JAR package containing the application's code, then use `sbt run` to execute our program.
+Finally, for sbt to work correctly, we'll need to layout `SimpleApp.scala` and `simple.sbt` according to the typical directory structure. Once that is in place, we can create a JAR package containing the application's code, then use `sbt/sbt run` to execute our program.
{% highlight bash %}
$ find .
@@ -157,8 +157,8 @@ $ find .
./src/main/scala
./src/main/scala/SimpleApp.scala
-$ sbt package
-$ sbt run
+$ sbt/sbt package
+$ sbt/sbt run
...
Lines with a: 46, Lines with b: 23
{% endhighlight %}
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index a35e003cdc..717071d72c 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -12,7 +12,7 @@ was added to Spark in version 0.6.0, and improved in 0.7.0 and 0.8.0.
We need a consolidated Spark JAR (which bundles all the required dependencies) to run Spark jobs on a YARN cluster.
This can be built by setting the Hadoop version and `SPARK_YARN` environment variable, as follows:
- SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt assembly
+ SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt/sbt assembly
The assembled JAR will be something like this:
`./assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly_{{site.SPARK_VERSION}}-hadoop2.0.5.jar`.
@@ -25,7 +25,7 @@ The build process now also supports new YARN versions (2.2.x). See below.
- The assembled jar can be installed into HDFS or used locally.
- Your application code must be packaged into a separate JAR file.
-If you want to test out the YARN deployment mode, you can use the current Spark examples. A `spark-examples_{{site.SCALA_VERSION}}-{{site.SPARK_VERSION}}` file can be generated by running `sbt assembly`. NOTE: since the documentation you're reading is for Spark version {{site.SPARK_VERSION}}, we are assuming here that you have downloaded Spark {{site.SPARK_VERSION}} or checked it out of source control. If you are using a different version of Spark, the version numbers in the jar generated by the sbt package command will obviously be different.
+If you want to test out the YARN deployment mode, you can use the current Spark examples. A `spark-examples_{{site.SCALA_VERSION}}-{{site.SPARK_VERSION}}` file can be generated by running `sbt/sbt assembly`. NOTE: since the documentation you're reading is for Spark version {{site.SPARK_VERSION}}, we are assuming here that you have downloaded Spark {{site.SPARK_VERSION}} or checked it out of source control. If you are using a different version of Spark, the version numbers in the jar generated by the sbt package command will obviously be different.
# Configuration
@@ -72,7 +72,7 @@ The command to launch the YARN Client is as follows:
For example:
# Build the Spark assembly JAR and the Spark examples JAR
- $ SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt assembly
+ $ SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt/sbt assembly
# Configure logging
$ cp conf/log4j.properties.template conf/log4j.properties
diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md
index 3d0e8923d5..c1ef46a1cd 100644
--- a/docs/scala-programming-guide.md
+++ b/docs/scala-programming-guide.md
@@ -31,7 +31,7 @@ In addition, if you wish to access an HDFS cluster, you need to add a dependency
artifactId = hadoop-client
version = <your-hdfs-version>
-For other build systems, you can run `sbt assembly` to pack Spark and its dependencies into one JAR (`assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop*.jar`), then add this to your CLASSPATH. Set the HDFS version as described [here](index.html#a-note-about-hadoop-versions).
+For other build systems, you can run `sbt/sbt assembly` to pack Spark and its dependencies into one JAR (`assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop*.jar`), then add this to your CLASSPATH. Set the HDFS version as described [here](index.html#a-note-about-hadoop-versions).
Finally, you need to import some Spark classes and implicit conversions into your program. Add the following lines:
diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala
index 546495357f..4e0058cd70 100644
--- a/examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala
@@ -134,9 +134,9 @@ object FeederActor {
* <hostname> and <port> describe the AkkaSystem that Spark Sample feeder is running on.
*
* To run this example locally, you may run Feeder Actor as
- * `$ ./bin/run-example spark.streaming.examples.FeederActor 127.0.1.1 9999`
+ * `$ ./bin/run-example org.apache.spark.streaming.examples.FeederActor 127.0.1.1 9999`
* and then run the example
- * `$ ./bin/run-example spark.streaming.examples.ActorWordCount local[2] 127.0.1.1 9999`
+ * `$ ./bin/run-example org.apache.spark.streaming.examples.ActorWordCount local[2] 127.0.1.1 9999`
*/
object ActorWordCount {
def main(args: Array[String]) {
diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala
index 1486d77d8a..ea6ea67419 100644
--- a/examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala
@@ -28,7 +28,7 @@ import org.apache.spark.streaming.StreamingContext._
* <directory> is the directory that Spark Streaming will use to find and read new text files.
*
* To run this on your local machine on directory `localdir`, run this example
- * `$ ./bin/run-example spark.streaming.examples.HdfsWordCount local[2] localdir`
+ * `$ ./bin/run-example org.apache.spark.streaming.examples.HdfsWordCount local[2] localdir`
* Then create a text file in `localdir` and the words in the file will get counted.
*/
object HdfsWordCount {
diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala
index 172091be2e..197461655e 100644
--- a/examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala
@@ -35,7 +35,7 @@ import org.apache.spark.streaming.util.RawTextHelper._
* <numThreads> is the number of threads the kafka consumer should use
*
* Example:
- * `./bin/run-example spark.streaming.examples.KafkaWordCount local[2] zoo01,zoo02,zoo03 my-consumer-group topic1,topic2 1`
+ * `./bin/run-example org.apache.spark.streaming.examples.KafkaWordCount local[2] zoo01,zoo02,zoo03 my-consumer-group topic1,topic2 1`
*/
object KafkaWordCount {
def main(args: Array[String]) {
diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala
index 74d76ec26c..6a32c75373 100644
--- a/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala
@@ -29,7 +29,7 @@ import org.apache.spark.streaming.StreamingContext._
* To run this on your local machine, you need to first run a Netcat server
* `$ nc -lk 9999`
* and then run the example
- * `$ ./bin/run-example spark.streaming.examples.NetworkWordCount local[2] localhost 9999`
+ * `$ ./bin/run-example org.apache.spark.streaming.examples.NetworkWordCount local[2] localhost 9999`
*/
object NetworkWordCount {
def main(args: Array[String]) {
diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala
index f43c8ab61d..002db57d59 100644
--- a/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala
@@ -29,7 +29,7 @@ import org.apache.spark.streaming.StreamingContext._
* To run this on your local machine, you need to first run a Netcat server
* `$ nc -lk 9999`
* and then run the example
- * `$ ./bin/run-example spark.streaming.examples.StatefulNetworkWordCount local[2] localhost 9999`
+ * `$ ./bin/run-example org.apache.spark.streaming.examples.StatefulNetworkWordCount local[2] localhost 9999`
*/
object StatefulNetworkWordCount {
def main(args: Array[String]) {
diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala
index 89d3042123..beda73a71b 100644
--- a/examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala
+++ b/examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala
@@ -62,9 +62,9 @@ object SimpleZeroMQPublisher {
* <zeroMQurl> and <topic> describe where zeroMq publisher is running.
*
* To run this example locally, you may run publisher as
- * `$ ./bin/run-example spark.streaming.examples.SimpleZeroMQPublisher tcp://127.0.1.1:1234 foo.bar`
+ * `$ ./bin/run-example org.apache.spark.streaming.examples.SimpleZeroMQPublisher tcp://127.0.1.1:1234 foo.bar`
* and run the example as
- * `$ ./bin/run-example spark.streaming.examples.ZeroMQWordCount local[2] tcp://127.0.1.1:1234 foo`
+ * `$ ./bin/run-example org.apache.spark.streaming.examples.ZeroMQWordCount local[2] tcp://127.0.1.1:1234 foo`
*/
object ZeroMQWordCount {
def main(args: Array[String]) {
diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewGenerator.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewGenerator.scala
index 1a40fdb9a3..4fe57de4a4 100644
--- a/examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewGenerator.scala
+++ b/examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewGenerator.scala
@@ -39,8 +39,8 @@ object PageView extends Serializable {
/** Generates streaming events to simulate page views on a website.
*
* This should be used in tandem with PageViewStream.scala. Example:
- * $ ./bin/run-example spark.streaming.examples.clickstream.PageViewGenerator 44444 10
- * $ ./bin/run-example spark.streaming.examples.clickstream.PageViewStream errorRatePerZipCode localhost 44444
+ * $ ./bin/run-example org.apache.spark.streaming.examples.clickstream.PageViewGenerator 44444 10
+ * $ ./bin/run-example org.apache.spark.streaming.examples.clickstream.PageViewStream errorRatePerZipCode localhost 44444
*
* When running this, you may want to set the root logging level to ERROR in
* conf/log4j.properties to reduce the verbosity of the output.
diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewStream.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewStream.scala
index 0569846f18..807af199f4 100644
--- a/examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewStream.scala
+++ b/examples/src/main/scala/org/apache/spark/streaming/examples/clickstream/PageViewStream.scala
@@ -25,8 +25,8 @@ import org.apache.spark.SparkContext._
* operators available in Spark streaming.
*
* This should be used in tandem with PageViewStream.scala. Example:
- * $ ./bin/run-example spark.streaming.examples.clickstream.PageViewGenerator 44444 10
- * $ ./bin/run-example spark.streaming.examples.clickstream.PageViewStream errorRatePerZipCode localhost 44444
+ * $ ./bin/run-example org.apache.spark.streaming.examples.clickstream.PageViewGenerator 44444 10
+ * $ ./bin/run-example org.apache.spark.streaming.examples.clickstream.PageViewStream errorRatePerZipCode localhost 44444
*/
object PageViewStream {
def main(args: Array[String]) {
diff --git a/make-distribution.sh b/make-distribution.sh
index 6c466c8a06..1a3a5d0209 100755
--- a/make-distribution.sh
+++ b/make-distribution.sh
@@ -44,13 +44,16 @@ DISTDIR="$FWDIR/dist"
# Get version from SBT
export TERM=dumb # Prevents color codes in SBT output
-if ! test `which sbt` ;then
+VERSIONSTRING=$FWDIR/sbt/sbt "show version"
+
+if [ $? == -1 ] ;then
echo -e "You need sbt installed and available on your path."
echo -e "Download sbt from http://www.scala-sbt.org/"
exit -1;
fi
-VERSION=$(sbt "show version" | tail -1 | cut -f 2 | sed 's/^\([a-zA-Z0-9.-]*\).*/\1/')
+VERSION=$(echo "${VERSIONSTRING}" | tail -1 | cut -f 2 | sed 's/^\([a-zA-Z0-9.-]*\).*/\1/')
+echo "Version is ${VERSION}"
# Initialize defaults
SPARK_HADOOP_VERSION=1.0.4
@@ -92,7 +95,7 @@ export SPARK_HADOOP_VERSION
export SPARK_YARN
cd $FWDIR
-"sbt" "assembly/assembly"
+"sbt/sbt" "assembly/assembly"
# Make directories
rm -rf "$DISTDIR"
diff --git a/pom.xml b/pom.xml
index aa2f076aac..78d2f162b5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -188,7 +188,7 @@
<dependency>
<groupId>com.ning</groupId>
<artifactId>compress-lzf</artifactId>
- <version>0.8.4</version>
+ <version>1.0.0</version>
</dependency>
<dependency>
<groupId>org.xerial.snappy</groupId>
@@ -727,7 +727,6 @@
<hadoop.major.version>2</hadoop.major.version>
<!-- 0.23.* is same as 2.0.* - except hardened to run production jobs -->
<hadoop.version>0.23.7</hadoop.version>
- <protobuf.version>2.5.0</protobuf.version>
<!--<hadoop.version>2.0.5-alpha</hadoop.version> -->
</properties>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index a6c560d5c6..051e5105f3 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -70,9 +70,7 @@ object SparkBuild extends Build {
lazy val MavenCompile = config("m2r") extend(Compile)
lazy val publishLocalBoth = TaskKey[Unit]("publish-local", "publish local for m2 and ivy")
val sparkHome = System.getProperty("user.dir")
- System.setProperty("spark.home", sparkHome)
- System.setProperty("spark.testing", "1")
-
+
// Allows build configuration to be set through environment variables
lazy val hadoopVersion = Properties.envOrElse("SPARK_HADOOP_VERSION", DEFAULT_HADOOP_VERSION)
lazy val isNewHadoop = Properties.envOrNone("SPARK_IS_NEW_HADOOP") match {
@@ -115,8 +113,8 @@ object SparkBuild extends Build {
// Fork new JVMs for tests and set Java options for those
fork := true,
- javaOptions += "-Dspark.home=" + sparkHome,
- javaOptions += "-Dspark.testing=1",
+ javaOptions in Test += "-Dspark.home=" + sparkHome,
+ javaOptions in Test += "-Dspark.testing=1",
javaOptions += "-Xmx3g",
// Show full stack trace and duration in test cases.
testOptions in Test += Tests.Argument("-oDF"),
@@ -230,7 +228,7 @@ object SparkBuild extends Build {
"org.slf4j" % "slf4j-api" % slf4jVersion,
"org.slf4j" % "slf4j-log4j12" % slf4jVersion,
"commons-daemon" % "commons-daemon" % "1.0.10", // workaround for bug HADOOP-9407
- "com.ning" % "compress-lzf" % "0.8.4",
+ "com.ning" % "compress-lzf" % "1.0.0",
"org.xerial.snappy" % "snappy-java" % "1.0.5",
"org.ow2.asm" % "asm" % "4.0",
"org.spark-project.akka" %% "akka-remote" % "2.2.3-shaded-protobuf" excludeAll(excludeNetty),
diff --git a/project/build.properties b/project/build.properties
index 9647277162..839f5fbb0c 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -14,5 +14,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
sbt.version=0.12.4
diff --git a/sbin/stop-slaves.sh b/sbin/stop-slaves.sh
index c6b0b6ab66..eb803b4900 100755
--- a/sbin/stop-slaves.sh
+++ b/sbin/stop-slaves.sh
@@ -17,8 +17,8 @@
# limitations under the License.
#
-bin=`dirname "$0"`
-bin=`cd "$sbin"; pwd`
+sbin=`dirname "$0"`
+sbin=`cd "$sbin"; pwd`
. "$sbin/spark-config.sh"
diff --git a/sbt/sbt b/sbt/sbt
new file mode 100755
index 0000000000..22672f2346
--- /dev/null
+++ b/sbt/sbt
@@ -0,0 +1,43 @@
+#!/bin/bash
+# This script launches sbt for this project. If present it uses the system
+# version of sbt. If there is no system version of sbt it attempts to download
+# sbt locally.
+SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties`
+URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
+URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
+JAR=sbt/sbt-launch-${SBT_VERSION}.jar
+
+printf "Checking for system sbt ["
+if hash sbt 2>/dev/null; then
+ printf "FOUND]\n"
+ # Use System SBT
+ sbt "$@"
+else
+ printf "NOT FOUND]\n"
+ # Download sbt or use already downloaded
+ if [ ! -d .sbtlib ]; then
+ mkdir .sbtlib
+ fi
+ if [ ! -f ${JAR} ]; then
+ # Download
+ printf "Attempting to fetch sbt\n"
+ if hash curl 2>/dev/null; then
+ curl --progress-bar ${URL1} > ${JAR} || curl --progress-bar ${URL2} > ${JAR}
+ elif hash wget 2>/dev/null; then
+ wget --progress=bar ${URL1} -O ${JAR} || wget --progress=bar ${URL2} -O ${JAR}
+ else
+ printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
+ exit -1
+ fi
+ fi
+ if [ ! -f ${JAR} ]; then
+ # We failed to download
+ printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
+ exit -1
+ fi
+ printf "Launching sbt from ${JAR}\n"
+ java \
+ -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
+ -jar ${JAR} \
+ "$@"
+fi