30 files changed, 77 insertions(+), 55 deletions(-)
diff --git a/bagel/pom.xml b/bagel/pom.xml
index 510cff4669..be2e358091 100644
--- a/bagel/pom.xml
+++ b/bagel/pom.xml
@@ -4,7 +4,7 @@
   <parent>
     <groupId>org.spark-project</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.7.1-SNAPSHOT</version>
+    <version>0.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/bin/spark-daemon.sh b/bin/spark-daemon.sh
index 0c584055c7..8ee3ec481f 100755
--- a/bin/spark-daemon.sh
+++ b/bin/spark-daemon.sh
@@ -30,7 +30,7 @@
 #   SPARK_NICENESS The scheduling priority for daemons. Defaults to 0.
 ##
 
-usage="Usage: spark-daemon.sh [--config <conf-dir>] [--hosts hostlistfile] (start|stop) <spark-command> <args...>"
+usage="Usage: spark-daemon.sh [--config <conf-dir>] [--hosts hostlistfile] (start|stop) <spark-command> <spark-instance-number> <args...>"
 
 # if no args specified, show usage
 if [ $# -le 1 ]; then
@@ -48,6 +48,8 @@ startStop=$1
 shift
 command=$1
 shift
+instance=$1
+shift
 
 spark_rotate_log () {
@@ -92,10 +94,10 @@ if [ "$SPARK_PID_DIR" = "" ]; then
 fi
 
 # some variables
-export SPARK_LOGFILE=spark-$SPARK_IDENT_STRING-$command-$HOSTNAME.log
+export SPARK_LOGFILE=spark-$SPARK_IDENT_STRING-$command-$instance-$HOSTNAME.log
 export SPARK_ROOT_LOGGER="INFO,DRFA"
-log=$SPARK_LOG_DIR/spark-$SPARK_IDENT_STRING-$command-$HOSTNAME.out
-pid=$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command.pid
+log=$SPARK_LOG_DIR/spark-$SPARK_IDENT_STRING-$command-$instance-$HOSTNAME.out
+pid=$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command-$instance.pid
 
 # Set default scheduling priority
 if [ "$SPARK_NICENESS" = "" ]; then
diff --git a/bin/spark-daemons.sh b/bin/spark-daemons.sh
index 4f9719ee80..0619097e4d 100755
--- a/bin/spark-daemons.sh
+++ b/bin/spark-daemons.sh
@@ -2,7 +2,7 @@
 
 # Run a Spark command on all slave hosts.
 
-usage="Usage: spark-daemons.sh [--config confdir] [--hosts hostlistfile] [start|stop] command args..."
+usage="Usage: spark-daemons.sh [--config confdir] [--hosts hostlistfile] [start|stop] command instance-number args..."
 
 # if no args specified, show usage
 if [ $# -le 1 ]; then
diff --git a/bin/start-master.sh b/bin/start-master.sh
index 87feb261fe..83a3e1f3dc 100755
--- a/bin/start-master.sh
+++ b/bin/start-master.sh
@@ -32,4 +32,4 @@ if [ "$SPARK_PUBLIC_DNS" = "" ]; then
     fi
 fi
 
-"$bin"/spark-daemon.sh start spark.deploy.master.Master --ip $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT
+"$bin"/spark-daemon.sh start spark.deploy.master.Master 1 --ip $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT
diff --git a/bin/start-slave.sh b/bin/start-slave.sh
index 45a0cf7a6b..616c76e4ee 100755
--- a/bin/start-slave.sh
+++ b/bin/start-slave.sh
@@ -11,4 +11,4 @@ if [ "$SPARK_PUBLIC_DNS" = "" ]; then
     fi
 fi
 
-"$bin"/spark-daemon.sh start spark.deploy.worker.Worker $1
+"$bin"/spark-daemon.sh start spark.deploy.worker.Worker "$@"
diff --git a/bin/start-slaves.sh b/bin/start-slaves.sh
index 390247ca4a..4e05224190 100755
--- a/bin/start-slaves.sh
+++ b/bin/start-slaves.sh
@@ -21,4 +21,13 @@ fi
 echo "Master IP: $SPARK_MASTER_IP"
 
 # Launch the slaves
-exec "$bin/slaves.sh" cd "$SPARK_HOME" \; "$bin/start-slave.sh" spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT
+if [ "$SPARK_WORKER_INSTANCES" = "" ]; then
+  exec "$bin/slaves.sh" cd "$SPARK_HOME" \; "$bin/start-slave.sh" 1 spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT
+else
+  if [ "$SPARK_WORKER_WEBUI_PORT" = "" ]; then
+    SPARK_WORKER_WEBUI_PORT=8081
+  fi
+  for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do
+    "$bin/slaves.sh" cd "$SPARK_HOME" \; "$bin/start-slave.sh" $(( $i + 1 )) spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT --webui-port $(( $SPARK_WORKER_WEBUI_PORT + $i ))
+  done
+fi
diff --git a/bin/stop-master.sh b/bin/stop-master.sh
index f75167dd2c..172ee5891d 100755
--- a/bin/stop-master.sh
+++ b/bin/stop-master.sh
@@ -7,4 +7,4 @@ bin=`cd "$bin"; pwd`
 
 . "$bin/spark-config.sh"
 
-"$bin"/spark-daemon.sh stop spark.deploy.master.Master
\ No newline at end of file
+"$bin"/spark-daemon.sh stop spark.deploy.master.Master 1
diff --git a/bin/stop-slaves.sh b/bin/stop-slaves.sh
index 21c9ebf324..fbfc594472 100755
--- a/bin/stop-slaves.sh
+++ b/bin/stop-slaves.sh
@@ -7,4 +7,14 @@ bin=`cd "$bin"; pwd`
 
 . "$bin/spark-config.sh"
 
-"$bin"/spark-daemons.sh stop spark.deploy.worker.Worker
\ No newline at end of file
+if [ -f "${SPARK_CONF_DIR}/spark-env.sh" ]; then
+  . "${SPARK_CONF_DIR}/spark-env.sh"
+fi
+
+if [ "$SPARK_WORKER_INSTANCES" = "" ]; then
+  "$bin"/spark-daemons.sh stop spark.deploy.worker.Worker 1
+else
+  for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do
+    "$bin"/spark-daemons.sh stop spark.deploy.worker.Worker $(( $i + 1 ))
+  done
+fi
diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
index 6d71ec5691..37565ca827 100755
--- a/conf/spark-env.sh.template
+++ b/conf/spark-env.sh.template
@@ -12,6 +12,7 @@
 # - SPARK_WORKER_CORES, to set the number of cores to use on this machine
 # - SPARK_WORKER_MEMORY, to set how much memory to use (e.g. 1000m, 2g)
 # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT
+# - SPARK_WORKER_INSTANCES, to set the number of worker instances/processes to be spawned on every slave machine
 #
 # Finally, Spark also relies on the following variables, but these can be set
 # on just the *master* (i.e. in your driver program), and will automatically
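Taken together, the script changes above thread a worker-instance number (counted from 1) through every daemon command: each instance gets its own log file, pid file, and web UI port, with instance i serving its web UI on SPARK_WORKER_WEBUI_PORT + (i - 1). The following is a minimal sketch of that naming and port scheme, written in Scala purely for illustration; the object and method names are hypothetical and not part of the scripts:

    // Per-instance naming and port scheme implemented by the scripts above.
    // 8081 is the default SPARK_WORKER_WEBUI_PORT that start-slaves.sh falls back to.
    object WorkerInstances {
      def logFile(ident: String, command: String, instance: Int, host: String) =
        "spark-" + ident + "-" + command + "-" + instance + "-" + host + ".log"

      def pidFile(ident: String, command: String, instance: Int) =
        "spark-" + ident + "-" + command + "-" + instance + ".pid"

      def webUiPort(base: Int, instance: Int) = base + (instance - 1)

      def main(args: Array[String]) {
        for (i <- 1 to 3) println(webUiPort(8081, i))  // prints 8081, 8082, 8083
      }
    }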
diff --git a/core/pom.xml b/core/pom.xml
index fe9c803728..08717860a7 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -4,7 +4,7 @@
   <parent>
     <groupId>org.spark-project</groupId>
    <artifactId>spark-parent</artifactId>
-    <version>0.7.1-SNAPSHOT</version>
+    <version>0.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala
index 33dc7627a3..ccd9d0364a 100644
--- a/core/src/main/scala/spark/RDD.scala
+++ b/core/src/main/scala/spark/RDD.scala
@@ -366,7 +366,7 @@ abstract class RDD[T: ClassManifest](
    * Return a new RDD by applying a function to each partition of this RDD, while tracking the index
    * of the original partition.
    */
-  @deprecated("use mapPartitionsWithIndex")
+  @deprecated("use mapPartitionsWithIndex", "0.7.0")
   def mapPartitionsWithSplit[U: ClassManifest](
     f: (Int, Iterator[T]) => Iterator[U],
     preservesPartitioning: Boolean = false): RDD[U] =
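Since Scala 2.9, @deprecated takes a second argument naming the version in which the member was first deprecated, and the one-argument form produces a compiler warning of its own; adding "0.7.0" here silences that. A small illustrative example of the two-argument form (the class and method names below are made up):

    // @deprecated(message, since): the two-argument form expected since Scala 2.9.
    class Migrations {
      @deprecated("use newApi instead", "0.7.0")
      def oldApi(x: Int): Int = newApi(x)

      def newApi(x: Int): Int = x + 1
    }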
diff --git a/core/src/main/scala/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/spark/rdd/CheckpointRDD.scala
index 9e37bdf659..5db77eb142 100644
--- a/core/src/main/scala/spark/rdd/CheckpointRDD.scala
+++ b/core/src/main/scala/spark/rdd/CheckpointRDD.scala
@@ -122,6 +122,6 @@ private[spark] object CheckpointRDD extends Logging {
     val cpRDD = new CheckpointRDD[Int](sc, path.toString)
     assert(cpRDD.partitions.length == rdd.partitions.length, "Number of partitions is not the same")
     assert(cpRDD.collect.toList == rdd.collect.toList, "Data of partitions not the same")
-    fs.delete(path)
+    fs.delete(path, true)
   }
 }
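Hadoop's single-argument FileSystem.delete(Path) is deprecated in favor of delete(Path, recursive: Boolean), and since the path here is a checkpoint directory the recursive flag needs to be true for the contents to be removed as well. A minimal standalone sketch of the pattern (the directory name is illustrative):

    // Deleting a checkpoint directory through Hadoop's FileSystem API.
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path

    object CheckpointCleanup {
      def main(args: Array[String]) {
        val path = new Path("/tmp/spark-checkpoint-demo")
        val fs = path.getFileSystem(new Configuration())
        if (fs.exists(path)) {
          fs.delete(path, true)  // true => recursive: also remove directory contents
        }
      }
    }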
diff --git a/core/src/main/scala/spark/rdd/ZippedRDD.scala b/core/src/main/scala/spark/rdd/ZippedRDD.scala
index e80ec17aa5..35b0e06785 100644
--- a/core/src/main/scala/spark/rdd/ZippedRDD.scala
+++ b/core/src/main/scala/spark/rdd/ZippedRDD.scala
@@ -10,17 +10,17 @@ private[spark] class ZippedPartition[T: ClassManifest, U: ClassManifest](
     @transient rdd2: RDD[U]
   ) extends Partition {
 
-  var split1 = rdd1.partitions(idx)
-  var split2 = rdd1.partitions(idx)
+  var partition1 = rdd1.partitions(idx)
+  var partition2 = rdd2.partitions(idx)
   override val index: Int = idx
 
-  def splits = (split1, split2)
+  def partitions = (partition1, partition2)
 
   @throws(classOf[IOException])
   private def writeObject(oos: ObjectOutputStream) {
-    // Update the reference to parent split at the time of task serialization
-    split1 = rdd1.partitions(idx)
-    split2 = rdd2.partitions(idx)
+    // Update the reference to parent partition at the time of task serialization
+    partition1 = rdd1.partitions(idx)
+    partition2 = rdd2.partitions(idx)
     oos.defaultWriteObject()
   }
 }
@@ -43,13 +43,13 @@ class ZippedRDD[T: ClassManifest, U: ClassManifest](
   }
 
   override def compute(s: Partition, context: TaskContext): Iterator[(T, U)] = {
-    val (split1, split2) = s.asInstanceOf[ZippedPartition[T, U]].splits
-    rdd1.iterator(split1, context).zip(rdd2.iterator(split2, context))
+    val (partition1, partition2) = s.asInstanceOf[ZippedPartition[T, U]].partitions
+    rdd1.iterator(partition1, context).zip(rdd2.iterator(partition2, context))
   }
 
   override def getPreferredLocations(s: Partition): Seq[String] = {
-    val (split1, split2) = s.asInstanceOf[ZippedPartition[T, U]].splits
-    rdd1.preferredLocations(split1).intersect(rdd2.preferredLocations(split2))
+    val (partition1, partition2) = s.asInstanceOf[ZippedPartition[T, U]].partitions
+    rdd1.preferredLocations(partition1).intersect(rdd2.preferredLocations(partition2))
  }
 
   override def clearDependencies() {
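Note that the ZippedRDD change is more than a rename: the old initializer took split2 from rdd1.partitions(idx), so both halves of the pair referenced the first RDD's partition. ZippedRDD backs RDD.zip, which pairs the i-th element of one RDD with the i-th element of the other and assumes both RDDs have the same number of partitions and the same number of elements per partition. A hypothetical driver program using the API of this era:

    // RDD.zip pairs corresponding partitions of two equally-partitioned RDDs.
    import spark.SparkContext

    object ZipExample {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "ZipExample")
        val nums  = sc.parallelize(1 to 4, 2)
        val names = sc.parallelize(Seq("a", "b", "c", "d"), 2)
        println(nums.zip(names).collect().toList)
        // List((1,a), (2,b), (3,c), (4,d))
        sc.stop()
      }
    }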
diff --git a/docs/_config.yml b/docs/_config.yml
index f99d5bb376..5c135a0242 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -3,8 +3,8 @@ markdown: kramdown
 # These allow the documentation to be updated with new releases
 # of Spark, Scala, and Mesos.
-SPARK_VERSION: 0.7.1-SNAPSHOT
-SPARK_VERSION_SHORT: 0.7.1
-SCALA_VERSION: 2.9.2
+SPARK_VERSION: 0.8.0-SNAPSHOT
+SPARK_VERSION_SHORT: 0.8.0
+SCALA_VERSION: 2.9.3
 MESOS_VERSION: 0.9.0-incubating
 SPARK_ISSUE_TRACKER_URL: https://spark-project.atlassian.net
diff --git a/docs/building-with-maven.md b/docs/building-with-maven.md
index c2eeafd07a..04cd79d039 100644
--- a/docs/building-with-maven.md
+++ b/docs/building-with-maven.md
@@ -42,10 +42,10 @@ To run a specific test suite:
 
 You might run into the following errors if you're using a vanilla installation of Maven:
 
-    [INFO] Compiling 203 Scala sources and 9 Java sources to /Users/andyk/Development/spark/core/target/scala-2.9.2/classes...
+    [INFO] Compiling 203 Scala sources and 9 Java sources to /Users/me/Development/spark/core/target/scala-{{site.SCALA_VERSION}}/classes...
     [ERROR] PermGen space -> [Help 1]
 
-    [INFO] Compiling 203 Scala sources and 9 Java sources to /Users/andyk/Development/spark/core/target/scala-2.9.2/classes...
+    [INFO] Compiling 203 Scala sources and 9 Java sources to /Users/me/Development/spark/core/target/scala-{{site.SCALA_VERSION}}/classes...
     [ERROR] Java heap space -> [Help 1]
 
 To fix these, you can do the following:
diff --git a/docs/index.md b/docs/index.md
index 51d505e1fa..0c4add45dc 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -18,7 +18,7 @@ or you will need to set the `SCALA_HOME` environment variable to point
 to where you've installed Scala. Scala must also be accessible through one
 of these methods on slave nodes on your cluster.
 
-Spark uses [Simple Build Tool](https://github.com/harrah/xsbt/wiki), which is bundled with it. To compile the code, go into the top-level Spark directory and run
+Spark uses [Simple Build Tool](http://www.scala-sbt.org), which is bundled with it. To compile the code, go into the top-level Spark directory and run
 
     sbt/sbt package
diff --git a/examples/pom.xml b/examples/pom.xml
index 39cc47c709..270777e29c 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -4,7 +4,7 @@
   <parent>
     <groupId>org.spark-project</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.7.1-SNAPSHOT</version>
+    <version>0.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
@@ -22,7 +22,7 @@
     <dependency>
       <groupId>com.twitter</groupId>
       <artifactId>algebird-core_2.9.2</artifactId>
-      <version>0.1.8</version>
+      <version>0.1.11</version>
     </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
diff --git a/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala b/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala
index 483aae452b..a9642100e3 100644
--- a/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala
+++ b/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala
@@ -49,7 +49,7 @@ object TwitterAlgebirdCMS {
 
     val users = stream.map(status => status.getUser.getId)
 
-    val cms = new CountMinSketchMonoid(DELTA, EPS, SEED, PERC)
+    val cms = new CountMinSketchMonoid(EPS, DELTA, SEED, PERC)
     var globalCMS = cms.zero
     val mm = new MapMonoid[Long, Int]()
     var globalExact = Map[Long, Int]()
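Algebird's CountMinSketchMonoid constructor takes its accuracy parameters in the order (eps, delta, seed, heavyHittersPct); the example had eps and delta swapped, which matters because eps bounds the additive estimation error while delta bounds the probability of exceeding that bound. A standalone sketch, assuming algebird 0.1.11's CMS API:

    // Count-min sketch with algebird: parameter order is (eps, delta, seed, pct).
    import com.twitter.algebird.CountMinSketchMonoid

    object CmsDemo {
      def main(args: Array[String]) {
        val EPS   = 0.01    // additive error bound, as a fraction of total count
        val DELTA = 1E-3    // probability of exceeding the error bound
        val SEED  = 1
        val PERC  = 0.001   // heavy-hitters threshold
        val cms = new CountMinSketchMonoid(EPS, DELTA, SEED, PERC)
        val sketch = cms.create(Seq(1L, 2L, 2L, 3L))
        println(sketch.frequency(2L).estimate)  // approximately 2
      }
    }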
"algebird-core_2.9.2" % "0.1.8") + libraryDependencies ++= Seq("com.twitter" % "algebird-core_2.9.2" % "0.1.11") ) def bagelSettings = sharedSettings ++ Seq(name := "spark-bagel") diff --git a/project/build.properties b/project/build.properties index d4287112c6..66ad72ce2e 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=0.11.3 +sbt.version=0.12.2 diff --git a/project/plugins.sbt b/project/plugins.sbt index 4d0e696a11..d4f2442872 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -4,13 +4,13 @@ resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/release resolvers += "Spray Repository" at "http://repo.spray.cc/" -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.8.3") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.8.5") -addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.1.0-RC1") +addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.1.1") -addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.0.0") +addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.2.0") -addSbtPlugin("cc.spray" %% "sbt-twirl" % "0.5.2") +addSbtPlugin("io.spray" %% "sbt-twirl" % "0.6.1") // For Sonatype publishing //resolvers += Resolver.url("sbt-plugin-releases", new URL("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases/"))(Resolver.ivyStylePatterns) diff --git a/python/examples/transitive_closure.py b/python/examples/transitive_closure.py index 73f7f8fbaf..7f85a1008e 100644 --- a/python/examples/transitive_closure.py +++ b/python/examples/transitive_closure.py @@ -24,7 +24,7 @@ if __name__ == "__main__": "Usage: PythonTC <master> [<slices>]" exit(-1) sc = SparkContext(sys.argv[1], "PythonTC") - slices = sys.argv[2] if len(sys.argv) > 2 else 2 + slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2 tc = sc.parallelize(generateGraph(), slices).cache() # Linear transitive closure: each round grows paths by one edge, diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 172ed85fab..a9fec17a9d 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -399,7 +399,7 @@ class RDD(object): >>> sc.parallelize(range(10)).saveAsTextFile(tempFile.name) >>> from fileinput import input >>> from glob import glob - >>> ''.join(input(glob(tempFile.name + "/part-0000*"))) + >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*")))) '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n' """ def func(split, iterator): diff --git a/repl-bin/pom.xml b/repl-bin/pom.xml index dd720e2291..fe526a7616 100644 --- a/repl-bin/pom.xml +++ b/repl-bin/pom.xml @@ -4,7 +4,7 @@ <parent> <groupId>org.spark-project</groupId> <artifactId>spark-parent</artifactId> - <version>0.7.1-SNAPSHOT</version> + <version>0.8.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> diff --git a/repl/pom.xml b/repl/pom.xml index a3e4606edc..0b5e400c3d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -4,7 +4,7 @@ <parent> <groupId>org.spark-project</groupId> <artifactId>spark-parent</artifactId> - <version>0.7.1-SNAPSHOT</version> + <version>0.8.0-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> diff --git a/repl/src/main/scala/spark/repl/SparkILoop.scala b/repl/src/main/scala/spark/repl/SparkILoop.scala index cd7b5128b2..23556dbc8f 100644 --- a/repl/src/main/scala/spark/repl/SparkILoop.scala +++ b/repl/src/main/scala/spark/repl/SparkILoop.scala @@ -200,7 +200,7 @@ class SparkILoop(in0: Option[BufferedReader], val out: PrintWriter, val master: ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ - 
diff --git a/python/examples/transitive_closure.py b/python/examples/transitive_closure.py
index 73f7f8fbaf..7f85a1008e 100644
--- a/python/examples/transitive_closure.py
+++ b/python/examples/transitive_closure.py
@@ -24,7 +24,7 @@ if __name__ == "__main__":
             "Usage: PythonTC <master> [<slices>]"
         exit(-1)
     sc = SparkContext(sys.argv[1], "PythonTC")
-    slices = sys.argv[2] if len(sys.argv) > 2 else 2
+    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
     tc = sc.parallelize(generateGraph(), slices).cache()
 
     # Linear transitive closure: each round grows paths by one edge,
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 172ed85fab..a9fec17a9d 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -399,7 +399,7 @@ class RDD(object):
         >>> sc.parallelize(range(10)).saveAsTextFile(tempFile.name)
         >>> from fileinput import input
         >>> from glob import glob
-        >>> ''.join(input(glob(tempFile.name + "/part-0000*")))
+        >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
         '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'
         """
         def func(split, iterator):
diff --git a/repl-bin/pom.xml b/repl-bin/pom.xml
index dd720e2291..fe526a7616 100644
--- a/repl-bin/pom.xml
+++ b/repl-bin/pom.xml
@@ -4,7 +4,7 @@
   <parent>
     <groupId>org.spark-project</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.7.1-SNAPSHOT</version>
+    <version>0.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/repl/pom.xml b/repl/pom.xml
index a3e4606edc..0b5e400c3d 100644
--- a/repl/pom.xml
+++ b/repl/pom.xml
@@ -4,7 +4,7 @@
   <parent>
     <groupId>org.spark-project</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.7.1-SNAPSHOT</version>
+    <version>0.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
diff --git a/repl/src/main/scala/spark/repl/SparkILoop.scala b/repl/src/main/scala/spark/repl/SparkILoop.scala
index cd7b5128b2..23556dbc8f 100644
--- a/repl/src/main/scala/spark/repl/SparkILoop.scala
+++ b/repl/src/main/scala/spark/repl/SparkILoop.scala
@@ -200,7 +200,7 @@ class SparkILoop(in0: Option[BufferedReader], val out: PrintWriter, val master:
       ____              __
      / __/__  ___ _____/ /__
     _\ \/ _ \/ _ `/ __/ '_/
-   /___/ .__/\_,_/_/ /_/\_\   version 0.7.1
+   /___/ .__/\_,_/_/ /_/\_\   version 0.8.0
       /_/
 """)
 import Properties._
diff --git a/run b/run
--- a/run
+++ b/run
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-SCALA_VERSION=2.9.2
+SCALA_VERSION=2.9.3
 
 # Figure out where the Scala framework is installed
 FWDIR="$(cd `dirname $0`; pwd)"
diff --git a/run2.cmd b/run2.cmd
--- a/run2.cmd
+++ b/run2.cmd
@@ -1,6 +1,6 @@
 @echo off
 
-set SCALA_VERSION=2.9.2
+set SCALA_VERSION=2.9.3
 
 rem Figure out where the Spark framework is installed
 set FWDIR=%~dp0
diff --git a/streaming/pom.xml b/streaming/pom.xml
index ec077e8089..b0d0cd0ff3 100644
--- a/streaming/pom.xml
+++ b/streaming/pom.xml
@@ -4,7 +4,7 @@
   <parent>
     <groupId>org.spark-project</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.7.1-SNAPSHOT</version>
+    <version>0.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
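Two of the Python changes above are behavioral fixes rather than version bumps: transitive_closure.py must convert the slice count from a string before passing it to parallelize (sys.argv entries are always strings), and the saveAsTextFile doctest in rdd.py sorts the lines it reads back because glob returns the part-files in no guaranteed order. The equivalent Scala idiom for the argument parsing, as a minimal sketch (the object name is illustrative):

    // Parse an optional partition-count argument; command-line arguments
    // arrive as strings and must be converted before use.
    object ParseSlices {
      def main(args: Array[String]) {
        val slices = if (args.length > 1) args(1).toInt else 2
        println("using " + slices + " slices")
      }
    }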