From 72ff62a37c7310bab02f0231e91d3ba4d423217a Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Sun, 28 Jul 2013 22:21:04 -0400 Subject: Two fixes to IPython support: - Don't attempt to run worker processes with ipython (that can cause some crashes as ipython prints things to standard out) - Allow passing some IPYTHON_OPTS to launch things like the notebook --- pyspark | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'pyspark') diff --git a/pyspark b/pyspark index 37a355462e..801239c108 100755 --- a/pyspark +++ b/pyspark @@ -53,9 +53,13 @@ if [[ "$SPARK_LAUNCH_WITH_SCALA" != "0" ]] ; then export SPARK_LAUNCH_WITH_SCALA=1 fi +if [ -n "$IPYTHON_OPTS" ]; then + IPYTHON=1 +fi + if [[ "$IPYTHON" = "1" ]] ; then - export PYSPARK_PYTHON="ipython" - exec "$PYSPARK_PYTHON" -i -c "%run $PYTHONSTARTUP" + IPYTHON_OPTS=${IPYTHON_OPTS:--i} + exec ipython "$IPYTHON_OPTS" -c "%run $PYTHONSTARTUP" else - exec "$PYSPARK_PYTHON" "$@" + exec "$PYSPARK_PYTHON" "$@" fi -- cgit v1.2.3 From ab0e625d9e0abd62a20754125952e3a00f2c275a Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Thu, 22 Aug 2013 23:02:09 -0700 Subject: Fix PySpark for assembly run and include it in dist --- .gitignore | 1 + core/lib/PY4J_LICENSE.txt | 27 +++++++++++++++++++++++++++ core/lib/PY4J_VERSION.txt | 1 + core/lib/py4j0.7.jar | Bin 0 -> 103286 bytes make-distribution.sh | 5 ++++- pyspark | 12 ++++++++---- python/lib/py4j0.7.jar | Bin 103286 -> 0 bytes 7 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 core/lib/PY4J_LICENSE.txt create mode 100644 core/lib/PY4J_VERSION.txt create mode 100644 core/lib/py4j0.7.jar delete mode 100644 python/lib/py4j0.7.jar (limited to 'pyspark') diff --git a/.gitignore b/.gitignore index 00fbff6a2c..e1f64a1133 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,4 @@ checkpoint derby.log dist/ spark-*-bin.tar.gz +unit-tests.log diff --git a/core/lib/PY4J_LICENSE.txt b/core/lib/PY4J_LICENSE.txt new file mode 100644 index 0000000000..a70279ca14 --- /dev/null +++ b/core/lib/PY4J_LICENSE.txt @@ -0,0 +1,27 @@ + +Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +- The name of the author may not be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
diff --git a/core/lib/PY4J_VERSION.txt b/core/lib/PY4J_VERSION.txt new file mode 100644 index 0000000000..04a0cd52a8 --- /dev/null +++ b/core/lib/PY4J_VERSION.txt @@ -0,0 +1 @@ +b7924aabe9c5e63f0a4d8bbd17019534c7ec014e diff --git a/core/lib/py4j0.7.jar b/core/lib/py4j0.7.jar new file mode 100644 index 0000000000..73b7ddb7d1 Binary files /dev/null and b/core/lib/py4j0.7.jar differ diff --git a/make-distribution.sh b/make-distribution.sh index df7bbf1e74..92b2706126 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -94,11 +94,14 @@ echo "Spark $VERSION built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE cp $FWDIR/assembly/target/*/*assembly*.jar "$DISTDIR/jars/" # Copy other things +mkdir "$DISTDIR"/conf +cp -r "$FWDIR/conf/*.template" "$DISTDIR" cp -r "$FWDIR/bin" "$DISTDIR" -cp -r "$FWDIR/conf" "$DISTDIR" +cp -r "$FWDIR/python" "$DISTDIR" cp "$FWDIR/spark-class" "$DISTDIR" cp "$FWDIR/spark-shell" "$DISTDIR" cp "$FWDIR/spark-executor" "$DISTDIR" +cp "$FWDIR/pyspark" "$DISTDIR" if [ "$MAKE_TGZ" == "true" ]; then diff --git a/pyspark b/pyspark index 801239c108..155ccd4fdf 100755 --- a/pyspark +++ b/pyspark @@ -24,10 +24,14 @@ FWDIR="$(cd `dirname $0`; pwd)" export SPARK_HOME="$FWDIR" # Exit if the user hasn't compiled Spark -if [ ! -e "$SPARK_HOME/repl/target" ]; then - echo "Failed to find Spark classes in $SPARK_HOME/repl/target" >&2 - echo "You need to compile Spark before running this program" >&2 - exit 1 +if [ ! -f "$FWDIR/RELEASE" ]; then + # Exit if the user hasn't compiled Spark + ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*.jar >& /dev/null + if [[ $? != 0 ]]; then + echo "Failed to find Spark assembly in $FWDIR/assembly/target" >&2 + echo "You need to compile Spark before running this program" >&2 + exit 1 + fi fi # Load environment variables from conf/spark-env.sh, if it exists diff --git a/python/lib/py4j0.7.jar b/python/lib/py4j0.7.jar deleted file mode 100644 index 73b7ddb7d1..0000000000 Binary files a/python/lib/py4j0.7.jar and /dev/null differ -- cgit v1.2.3 From 3ff105f87db217c0c0af0f176626a416b2669740 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Tue, 27 Aug 2013 15:46:23 -0700 Subject: Find assembly correctly in pyspark --- pyspark | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'pyspark') diff --git a/pyspark b/pyspark index 155ccd4fdf..aedc51e048 100755 --- a/pyspark +++ b/pyspark @@ -23,10 +23,12 @@ FWDIR="$(cd `dirname $0`; pwd)" # Export this as SPARK_HOME export SPARK_HOME="$FWDIR" +SCALA_VERSION=2.9.3 + # Exit if the user hasn't compiled Spark if [ ! -f "$FWDIR/RELEASE" ]; then # Exit if the user hasn't compiled Spark - ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*.jar >& /dev/null + ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar >& /dev/null if [[ $? 
!= 0 ]]; then echo "Failed to find Spark assembly in $FWDIR/assembly/target" >&2 echo "You need to compile Spark before running this program" >&2 -- cgit v1.2.3 From d7dec938e503b86d1b338c4df3439d3649a76294 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Tue, 27 Aug 2013 15:52:03 -0700 Subject: Don't use SPARK_LAUNCH_WITH_SCALA in pyspark --- pyspark | 5 ----- 1 file changed, 5 deletions(-) (limited to 'pyspark') diff --git a/pyspark b/pyspark index aedc51e048..2dba2ceb21 100755 --- a/pyspark +++ b/pyspark @@ -54,11 +54,6 @@ export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH export OLD_PYTHONSTARTUP=$PYTHONSTARTUP export PYTHONSTARTUP=$FWDIR/python/pyspark/shell.py -# Launch with `scala` by default: -if [[ "$SPARK_LAUNCH_WITH_SCALA" != "0" ]] ; then - export SPARK_LAUNCH_WITH_SCALA=1 -fi - if [ -n "$IPYTHON_OPTS" ]; then IPYTHON=1 fi -- cgit v1.2.3 From f3a964848dd2ba65491f3eea8a54439069aa1b29 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Fri, 30 Aug 2013 12:38:23 -0700 Subject: More doc improvements + better warnings when you haven't built Spark --- docs/_layouts/global.html | 15 +++++++++---- docs/img/incubator-logo.png | Bin 0 -> 11651 bytes docs/index.md | 53 ++++++++++++++++++++++++++------------------ docs/running-on-yarn.md | 39 +++++++++----------------------- pyspark | 2 +- run-example | 2 +- spark-class | 4 ++-- 7 files changed, 58 insertions(+), 57 deletions(-) create mode 100644 docs/img/incubator-logo.png (limited to 'pyspark') diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index a014554462..91a4a2eaee 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -66,6 +66,7 @@
      [navigation-menu hunk garbled in extraction; the recoverable change: an
       "MLlib (Machine Learning)" link is added to the "Programming Guides" dropdown
       (alongside "Spark in Python", "Spark Streaming" and "Bagel (Pregel on Spark)"),
       and in the "API Docs" dropdown the "MLlib (Machine Learning)" entry is moved
       ahead of "Bagel (Pregel on Spark)".]
      [footer hunk garbled in extraction; the recoverable change: the page footer gains
       the notice "Apache Spark is an effort undergoing incubation at the Apache Software
       Foundation", alongside the incubator logo image added below.]
    diff --git a/docs/img/incubator-logo.png b/docs/img/incubator-logo.png new file mode 100644 index 0000000000..33ca7f6227 Binary files /dev/null and b/docs/img/incubator-logo.png differ diff --git a/docs/index.md b/docs/index.md index ec9c7dd4f3..5aa7f74059 100644 --- a/docs/index.md +++ b/docs/index.md @@ -3,13 +3,13 @@ layout: global title: Spark Overview --- -Spark is a MapReduce-like cluster computing framework designed for low-latency iterative jobs and interactive use from an interpreter. -It provides clean, language-integrated APIs in [Scala](scala-programming-guide.html), [Java](java-programming-guide.html), and [Python](python-programming-guide.html), with a rich array of parallel operators. +Apache Spark is a cluster computing engine that aims to make data analytics both easier and faster. +It provides rich, language-integrated APIs in [Scala](scala-programming-guide.html), [Java](java-programming-guide.html), and [Python](python-programming-guide.html), and a powerful execution engine that supports general operator graphs. Spark can run on the Apache Mesos cluster manager, Hadoop YARN, Amazon EC2, or without an independent resource manager ("standalone mode"). # Downloading -Get Spark by visiting the [downloads page](http://spark-project.org/downloads.html) of the Spark website. This documentation is for Spark version {{site.SPARK_VERSION}}. +Get Spark from the [downloads page](http://spark.incubator.apache.org/downloads.html) of the Apache Spark site. This documentation is for Spark version {{site.SPARK_VERSION}}. # Building @@ -42,11 +42,17 @@ Finally, Spark can be used interactively from a modified version of the Scala in # A Note About Hadoop Versions -Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported +Spark uses the Hadoop-client library to talk to HDFS and other Hadoop-supported storage systems. Because the HDFS protocol has changed in different versions of -Hadoop, you must build Spark against the same version that your cluster runs. -You can change the version by setting the `HADOOP_VERSION` variable at the top -of `project/SparkBuild.scala`, then rebuilding Spark (`sbt/sbt clean compile`). +Hadoop, you must build Spark against the same version that your cluster uses. +You can do this by setting the `SPARK_HADOOP_VERSION` variable when compiling: + + SPARK_HADOOP_VERSION=1.2.1 sbt/sbt assembly + +In addition, if you wish to run Spark on [YARN](running-on-yarn.md), you should also +set `SPARK_YARN` to `true`: + + SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt/sbt assembly # Where to Go from Here @@ -54,15 +60,20 @@ of `project/SparkBuild.scala`, then rebuilding Spark (`sbt/sbt clean compile`). * [Quick Start](quick-start.html): a quick introduction to the Spark API; start here! 
* [Spark Programming Guide](scala-programming-guide.html): an overview of Spark concepts, and details on the Scala API -* [Java Programming Guide](java-programming-guide.html): using Spark from Java -* [Python Programming Guide](python-programming-guide.html): using Spark from Python -* [Spark Streaming Guide](streaming-programming-guide.html): using the alpha release of Spark Streaming + * [Java Programming Guide](java-programming-guide.html): using Spark from Java + * [Python Programming Guide](python-programming-guide.html): using Spark from Python +* [Spark Streaming](streaming-programming-guide.html): using the alpha release of Spark Streaming +* [MLlib (Machine Learning)](mllib-programming-guide.html): Spark's built-in machine learning library +* [Bagel (Pregel on Spark)](bagel-programming-guide.html): simple graph processing model **API Docs:** -* [Spark Java/Scala (Scaladoc)](api/core/index.html) -* [Spark Python (Epydoc)](api/pyspark/index.html) -* [Spark Streaming Java/Scala (Scaladoc)](api/streaming/index.html) +* [Spark for Java/Scala (Scaladoc)](api/core/index.html) +* [Spark for Python (Epydoc)](api/pyspark/index.html) +* [Spark Streaming for Java/Scala (Scaladoc)](api/streaming/index.html) +* [MLlib (Machine Learning) for Java/Scala (Scaladoc)](api/mllib/index.html) +* [Bagel (Pregel on Spark) for Scala (Scaladoc)](api/bagel/index.html) + **Deployment guides:** @@ -74,27 +85,27 @@ of `project/SparkBuild.scala`, then rebuilding Spark (`sbt/sbt clean compile`). **Other documents:** -* [Building Spark With Maven](building-with-maven.html): Build Spark using the Maven build tool * [Configuration](configuration.html): customize Spark via its configuration system * [Tuning Guide](tuning.html): best practices to optimize performance and memory use -* [Bagel](bagel-programming-guide.html): an implementation of Google's Pregel on Spark +* [Hardware Provisioning](hardware-provisioning.html): recommendations for cluster hardware +* [Building Spark with Maven](building-with-maven.html): Build Spark using the Maven build tool * [Contributing to Spark](contributing-to-spark.html) **External resources:** -* [Spark Homepage](http://www.spark-project.org) -* [Mailing List](http://groups.google.com/group/spark-users): ask questions about Spark here -* [AMP Camp](http://ampcamp.berkeley.edu/): a two-day training camp at UC Berkeley that featured talks and exercises - about Spark, Shark, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/agenda-2012), +* [Spark Homepage](http://spark.incubator.apache.org) +* [Mailing Lists](http://spark.incubator.apache.org/mailing-lists.html): ask questions about Spark here +* [AMP Camps](http://ampcamp.berkeley.edu/): a series of training camps at UC Berkeley that featured talks and + exercises about Spark, Shark, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/agenda-2012), [slides](http://ampcamp.berkeley.edu/agenda-2012) and [exercises](http://ampcamp.berkeley.edu/exercises-2012) are available online for free. 
-* [Code Examples](http://spark-project.org/examples.html): more are also available in the [examples subfolder](https://github.com/mesos/spark/tree/master/examples/src/main/scala/spark/examples) of Spark +* [Code Examples](http://spark.incubator.apache.org/examples.html): more are also available in the [examples subfolder](https://github.com/mesos/spark/tree/master/examples/src/main/scala/spark/examples) of Spark * [Paper Describing Spark](http://www.cs.berkeley.edu/~matei/papers/2012/nsdi_spark.pdf) * [Paper Describing Spark Streaming](http://www.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-259.pdf) # Community -To get help using Spark or keep up with Spark development, sign up for the [spark-users mailing list](http://groups.google.com/group/spark-users). +To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.incubator.apache.org/mailing-lists.html). If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users. diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 678cd57aba..fe5334ffdc 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -3,50 +3,33 @@ layout: global title: Launching Spark on YARN --- -Experimental support for running over a [YARN (Hadoop +Support for running on [YARN (Hadoop NextGen)](http://hadoop.apache.org/docs/r2.0.2-alpha/hadoop-yarn/hadoop-yarn-site/YARN.html) -cluster was added to Spark in version 0.6.0. This was merged into master as part of 0.7 effort. -To build spark with YARN support, please use the hadoop2-yarn profile. -Ex: mvn -Phadoop2-yarn clean install +was added to Spark in version 0.6.0, and improved in 0.7.0 and 0.8.0. -# Building spark core consolidated jar. +# Building a YARN-Enabled Assembly JAR -We need a consolidated spark core jar (which bundles all the required dependencies) to run Spark jobs on a yarn cluster. -This can be built either through sbt or via maven. +We need a consolidated Spark JAR (which bundles all the required dependencies) to run Spark jobs on a YARN cluster. +This can be built by setting the Hadoop version and `SPARK_YARN` environment variable, as follows: -- Building spark assembled jar via sbt. -Enable YARN support by setting `SPARK_YARN=true` when invoking sbt: + SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true ./sbt/sbt assembly - SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true ./sbt/sbt clean assembly - -The assembled jar would typically be something like : -`./yarn/target/spark-yarn-assembly-0.8.0-SNAPSHOT.jar` - - -- Building spark assembled jar via Maven. - Use the hadoop2-yarn profile and execute the package target. - -Something like this. Ex: - - mvn -Phadoop2-yarn -Dhadoop.version=2.0.5-alpha clean package -DskipTests=true - - -This will build the shaded (consolidated) jar. Typically something like : -`./yarn/target/spark-yarn-bin--shaded.jar` +The assembled JAR will be something like this: +`./assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly_{{site.SPARK_VERSION}}-hadoop2.0.5.jar`. # Preparations -- Building spark-yarn assembly (see above). +- Building a YARN-enabled assembly (see above). - Your application code must be packaged into a separate JAR file. -If you want to test out the YARN deployment mode, you can use the current Spark examples. A `spark-examples_{{site.SCALA_VERSION}}-{{site.SPARK_VERSION}}` file can be generated by running `sbt/sbt package`. 
NOTE: since the documentation you're reading is for Spark version {{site.SPARK_VERSION}}, we are assuming here that you have downloaded Spark {{site.SPARK_VERSION}} or checked it out of source control. If you are using a different version of Spark, the version numbers in the jar generated by the sbt package command will obviously be different. +If you want to test out the YARN deployment mode, you can use the current Spark examples. A `spark-examples_{{site.SCALA_VERSION}}-{{site.SPARK_VERSION}}` file can be generated by running `sbt/sbt assembly`. NOTE: since the documentation you're reading is for Spark version {{site.SPARK_VERSION}}, we are assuming here that you have downloaded Spark {{site.SPARK_VERSION}} or checked it out of source control. If you are using a different version of Spark, the version numbers in the jar generated by the sbt package command will obviously be different. # Configuration Most of the configs are the same for Spark on YARN as other deploys. See the Configuration page for more information on those. These are configs that are specific to SPARK on YARN. -* `SPARK_YARN_USER_ENV`, to add environment variables to the Spark processes launched on YARN. This can be a comma separated list of environment variables. ie SPARK_YARN_USER_ENV="JAVA_HOME=/jdk64,FOO=bar" +* `SPARK_YARN_USER_ENV`, to add environment variables to the Spark processes launched on YARN. This can be a comma separated list of environment variables, e.g. `SPARK_YARN_USER_ENV="JAVA_HOME=/jdk64,FOO=bar"`. # Launching Spark on YARN diff --git a/pyspark b/pyspark index 2dba2ceb21..4941a36d0d 100755 --- a/pyspark +++ b/pyspark @@ -31,7 +31,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar >& /dev/null if [[ $? != 0 ]]; then echo "Failed to find Spark assembly in $FWDIR/assembly/target" >&2 - echo "You need to compile Spark before running this program" >&2 + echo "You need to build Spark with sbt/sbt assembly before running this program" >&2 exit 1 fi fi diff --git a/run-example b/run-example index ccd4356bdf..24d83ba5cf 100755 --- a/run-example +++ b/run-example @@ -50,7 +50,7 @@ if [ -e "$EXAMPLES_DIR"/target/spark-examples*[0-9T].jar ]; then fi if [[ -z $SPARK_EXAMPLES_JAR ]]; then echo "Failed to find Spark examples assembly in $FWDIR/examples/target" >&2 - echo "You need to compile Spark before running this program" >&2 + echo "You need to build Spark with sbt/sbt assembly before running this program" >&2 exit 1 fi diff --git a/spark-class b/spark-class index 5ef3de9773..244b78b4e1 100755 --- a/spark-class +++ b/spark-class @@ -102,10 +102,10 @@ export JAVA_OPTS if [ ! -f "$FWDIR/RELEASE" ]; then # Exit if the user hasn't compiled Spark - ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*.jar >& /dev/null + ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar >& /dev/null if [[ $? != 0 ]]; then echo "Failed to find Spark assembly in $FWDIR/assembly/target" >&2 - echo "You need to compile Spark before running this program" >&2 + echo "You need to build Spark with sbt/sbt assembly before running this program" >&2 exit 1 fi fi -- cgit v1.2.3
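
Taken together, these patches change how pyspark and the other launcher scripts locate Spark and start the Python shell. A minimal usage sketch follows; the Hadoop version and the IPython notebook option shown here are illustrative assumptions, not values fixed by the patches:

    # Build the Hadoop-specific assembly that pyspark/spark-class now look for
    # (1.2.1 is only an example; match the Hadoop version your cluster runs)
    SPARK_HADOOP_VERSION=1.2.1 sbt/sbt assembly

    # Plain Python shell; PYSPARK_PYTHON is still used for the worker processes
    ./pyspark

    # IPython driver shell: setting IPYTHON_OPTS now implies IPYTHON=1, and only the
    # driver runs under ipython; the workers keep using PYSPARK_PYTHON
    IPYTHON=1 ./pyspark
    IPYTHON_OPTS="notebook" ./pyspark   # e.g. launch the IPython notebook (option assumed)

These commands assume a source checkout; in a binary distribution produced by make-distribution.sh the scripts skip the assembly check because the RELEASE file is present.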