From f0b68c623c116540470e06967c1554855d16a500 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 5 Feb 2013 19:02:46 -0800 Subject: Initial cut at replacing K, V in Java files --- .../spark/streaming/api/java/JavaDStreamLike.scala | 4 +- .../test/java/spark/streaming/JavaAPISuite.java | 56 ++++++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) (limited to 'streaming') diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index b93cb7865a..39fe0d0ccc 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -59,8 +59,8 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This]] extends Serializable } /** Return a new DStream by applying a function to all elements of this DStream. */ - def map[K, V](f: PairFunction[T, K, V]): JavaPairDStream[K, V] = { - def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K, V]]] + def map[K2, V2](f: PairFunction[T, K2, V2]): JavaPairDStream[K2, V2] = { + def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K2, V2]]] new JavaPairDStream(dstream.map(f)(cm))(f.keyType(), f.valueType()) } diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 79d6093429..26ac82b71a 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -506,6 +506,62 @@ public class JavaAPISuite implements Serializable { new Tuple2("new york", 3), new Tuple2("new york", 1))); + @Test + public void testPairMap() { // Maps pair -> pair + List>> inputData = stringIntKVStream; + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, "california"), + new Tuple2(3, "california"), + new Tuple2(4, "new york"), + new Tuple2(1, "new york")), + Arrays.asList( + new Tuple2(5, "california"), + new Tuple2(5, "california"), + new Tuple2(3, "new york"), + new Tuple2(1, "new york"))); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + JavaPairDStream reversed = pairStream.map( + new PairFunction, Integer, String>() { + @Override + public Tuple2 call(Tuple2 in) throws Exception { + return new Tuple2(in._2(), in._1()); + } + }); + + JavaTestUtils.attachTestOutputStream(reversed); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + + @Test + public void testPairMap2() { // Maps pair -> single + List>> inputData = stringIntKVStream; + + List> expected = Arrays.asList( + Arrays.asList(1, 3, 4, 1), + Arrays.asList(5, 5, 3, 1)); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + JavaDStream reversed = pairStream.map( + new Function, Integer>() { + @Override + public Integer call(Tuple2 in) throws Exception { + return in._2(); + } + }); + + JavaTestUtils.attachTestOutputStream(reversed); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + @Test public void testPairGroupByKey() { List>> inputData = stringStringKVStream; -- cgit v1.2.3 From 314d87a038d84c4ae9a6471ea19a5431153ea604 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 
09:20:37 -0800 Subject: Indentation fix --- .../src/test/java/spark/streaming/JavaAPISuite.java | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'streaming') diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 26ac82b71a..4cf9d115ae 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -511,16 +511,16 @@ public class JavaAPISuite implements Serializable { List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( - Arrays.asList( - new Tuple2(1, "california"), - new Tuple2(3, "california"), - new Tuple2(4, "new york"), - new Tuple2(1, "new york")), - Arrays.asList( - new Tuple2(5, "california"), - new Tuple2(5, "california"), - new Tuple2(3, "new york"), - new Tuple2(1, "new york"))); + Arrays.asList( + new Tuple2(1, "california"), + new Tuple2(3, "california"), + new Tuple2(4, "new york"), + new Tuple2(1, "new york")), + Arrays.asList( + new Tuple2(5, "california"), + new Tuple2(5, "california"), + new Tuple2(3, "new york"), + new Tuple2(1, "new york"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); -- cgit v1.2.3 From 20cf77054536acd9c064d6e7ffedce23a87fb6a5 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 09:21:06 -0800 Subject: Fix for flatmap --- .../spark/streaming/api/java/JavaDStreamLike.scala | 4 +-- .../test/java/spark/streaming/JavaAPISuite.java | 42 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'streaming') diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index 39fe0d0ccc..9cc263930e 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -78,10 +78,10 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This]] extends Serializable * Return a new DStream by applying a function to all elements of this DStream, * and then flattening the results */ - def flatMap[K, V](f: PairFlatMapFunction[T, K, V]): JavaPairDStream[K, V] = { + def flatMap[K2, V2](f: PairFlatMapFunction[T, K2, V2]): JavaPairDStream[K2, V2] = { import scala.collection.JavaConverters._ def fn = (x: T) => f.apply(x).asScala - def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K, V]]] + def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K2, V2]]] new JavaPairDStream(dstream.flatMap(fn)(cm))(f.keyType(), f.valueType()) } diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 4cf9d115ae..ec4e5ae18b 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -562,6 +562,48 @@ public class JavaAPISuite implements Serializable { Assert.assertEquals(expected, result); } + @Test + public void testPairToPairFlatMapWithChangingTypes() { // Maps pair -> pair + List>> inputData = Arrays.asList( + Arrays.asList( + new Tuple2("hi", 1), + new Tuple2("ho", 2)), + Arrays.asList( + new Tuple2("hi", 1), + new Tuple2("ho", 2))); + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, "h"), + new Tuple2(1, "i"), + new 
Tuple2(2, "h"), + new Tuple2(2, "o")), + Arrays.asList( + new Tuple2(1, "h"), + new Tuple2(1, "i"), + new Tuple2(2, "h"), + new Tuple2(2, "o"))); + + JavaDStream> stream = + JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + JavaPairDStream flatMapped = pairStream.flatMap( + new PairFlatMapFunction, Integer, String>() { + @Override + public Iterable> call(Tuple2 in) throws Exception { + List> out = new LinkedList>(); + for (Character s: in._1().toCharArray()) { + out.add(new Tuple2(in._2(), s.toString())); + } + return out; + } + }); + JavaTestUtils.attachTestOutputStream(flatMapped); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + @Test public void testPairGroupByKey() { List>> inputData = stringStringKVStream; -- cgit v1.2.3 From c65988bdc1b75e88e6df77df0b84fc3a34c5b028 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 09:51:57 -0800 Subject: Fix for MapPartitions --- .../spark/streaming/api/java/JavaDStreamLike.scala | 4 +- .../test/java/spark/streaming/JavaAPISuite.java | 67 +++++++++++++++++----- 2 files changed, 54 insertions(+), 17 deletions(-) (limited to 'streaming') diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index 9cc263930e..ec546c8190 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -100,8 +100,8 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This]] extends Serializable * of this DStream. Applying mapPartitions() to an RDD applies a function to each partition * of the RDD. 
*/ - def mapPartitions[K, V](f: PairFlatMapFunction[java.util.Iterator[T], K, V]) - : JavaPairDStream[K, V] = { + def mapPartitions[K2, V2](f: PairFlatMapFunction[java.util.Iterator[T], K2, V2]) + : JavaPairDStream[K2, V2] = { def fn = (x: Iterator[T]) => asScalaIterator(f.apply(asJavaIterator(x)).iterator()) new JavaPairDStream(dstream.mapPartitions(fn))(f.keyType(), f.valueType()) } diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index ec4e5ae18b..67d82d546f 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -507,7 +507,7 @@ public class JavaAPISuite implements Serializable { new Tuple2("new york", 1))); @Test - public void testPairMap() { // Maps pair -> pair + public void testPairMap() { // Maps pair -> pair of different type List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( @@ -538,6 +538,43 @@ public class JavaAPISuite implements Serializable { Assert.assertEquals(expected, result); } + @Test + public void testPairMapPartitions() { // Maps pair -> pair of different type + List>> inputData = stringIntKVStream; + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, "california"), + new Tuple2(3, "california"), + new Tuple2(4, "new york"), + new Tuple2(1, "new york")), + Arrays.asList( + new Tuple2(5, "california"), + new Tuple2(5, "california"), + new Tuple2(3, "new york"), + new Tuple2(1, "new york"))); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + JavaPairDStream reversed = pairStream.mapPartitions( + new PairFlatMapFunction>, Integer, String>() { + @Override + public Iterable> call(Iterator> in) throws Exception { + LinkedList> out = new LinkedList>(); + while (in.hasNext()) { + Tuple2 next = in.next(); + out.add(new Tuple2(next._2(), next._1())); + } + return out; + } + }); + + JavaTestUtils.attachTestOutputStream(reversed); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + @Test public void testPairMap2() { // Maps pair -> single List>> inputData = stringIntKVStream; @@ -588,16 +625,16 @@ public class JavaAPISuite implements Serializable { JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream flatMapped = pairStream.flatMap( - new PairFlatMapFunction, Integer, String>() { - @Override - public Iterable> call(Tuple2 in) throws Exception { - List> out = new LinkedList>(); - for (Character s: in._1().toCharArray()) { - out.add(new Tuple2(in._2(), s.toString())); - } - return out; - } - }); + new PairFlatMapFunction, Integer, String>() { + @Override + public Iterable> call(Tuple2 in) throws Exception { + List> out = new LinkedList>(); + for (Character s : in._1().toCharArray()) { + out.add(new Tuple2(in._2(), s.toString())); + } + return out; + } + }); JavaTestUtils.attachTestOutputStream(flatMapped); List>> result = JavaTestUtils.runStreams(ssc, 2, 2); @@ -668,7 +705,7 @@ public class JavaAPISuite implements Serializable { JavaPairDStream combined = pairStream.combineByKey( new Function() { - @Override + @Override public Integer call(Integer i) throws Exception { return i; } @@ -766,19 +803,19 @@ public class JavaAPISuite implements Serializable { JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream 
updated = pairStream.updateStateByKey( - new Function2, Optional, Optional>(){ + new Function2, Optional, Optional>() { @Override public Optional call(List values, Optional state) { int out = 0; if (state.isPresent()) { out = out + state.get(); } - for (Integer v: values) { + for (Integer v : values) { out = out + v; } return Optional.of(out); } - }); + }); JavaTestUtils.attachTestOutputStream(updated); List>> result = JavaTestUtils.runStreams(ssc, 3, 3); -- cgit v1.2.3 From 04786d07391c4052d6dc42ff0828a79a37bbbfdf Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 10:05:49 -0800 Subject: small fix --- streaming/src/test/java/spark/streaming/JavaAPISuite.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'streaming') diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 67d82d546f..551d4f15e4 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -810,12 +810,12 @@ public class JavaAPISuite implements Serializable { if (state.isPresent()) { out = out + state.get(); } - for (Integer v : values) { + for (Integer v: values) { out = out + v; } return Optional.of(out); } - }); + }); JavaTestUtils.attachTestOutputStream(updated); List>> result = JavaTestUtils.runStreams(ssc, 3, 3); -- cgit v1.2.3 From d09c36065ca040044530a50f0392c92866b6d301 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 10:45:45 -0800 Subject: Using tuple swap() --- streaming/src/test/java/spark/streaming/JavaAPISuite.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'streaming') diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 551d4f15e4..9bfcd83e4d 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -528,7 +528,7 @@ public class JavaAPISuite implements Serializable { new PairFunction, Integer, String>() { @Override public Tuple2 call(Tuple2 in) throws Exception { - return new Tuple2(in._2(), in._1()); + return in.swap(); } }); @@ -563,7 +563,7 @@ public class JavaAPISuite implements Serializable { LinkedList> out = new LinkedList>(); while (in.hasNext()) { Tuple2 next = in.next(); - out.add(new Tuple2(next._2(), next._1())); + out.add(next.swap()); } return out; } -- cgit v1.2.3 From 3f3e77f28b08fc1db110c3b14b2c90eaa6dca8ef Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 12 Feb 2013 13:57:57 -0800 Subject: STREAMING-50: Support transform workaround in JavaPairDStream This ports a useful workaround (the `transform` function) to JavaPairDStream. It is necessary to do things like sorting which are not supported yet in the core streaming API. 
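For context, the Java-side `transform` added below only wraps `DStream.transform` on the underlying Scala stream. A minimal Scala sketch of the per-batch sorting this enables (illustrative only: `sortEachBatch` and `pairs` are made-up names, and `sortByKey` is assumed to come from the usual `spark.SparkContext._` implicits for pair RDDs):

    import spark.RDD
    import spark.SparkContext._                    // assumed: implicits that provide sortByKey on pair RDDs
    import spark.streaming.DStream

    // Apply an RDD-to-RDD function to every batch: here, sort each batch of (key, count) pairs by key.
    def sortEachBatch(pairs: DStream[(String, Int)]): DStream[(String, Int)] =
      pairs.transform((rdd: RDD[(String, Int)]) => rdd.sortByKey())

The new JavaPairDStream.transform gives Java callers the same hook; the testPairTransform case added further down does the same thing through JavaPairRDD.sortByKey.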
--- .../spark/streaming/api/java/JavaPairDStream.scala | 34 +++++++++++++++- .../test/java/spark/streaming/JavaAPISuite.java | 45 ++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) (limited to 'streaming') diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala index ef10c091ca..eb2495e3ac 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala @@ -8,11 +8,11 @@ import scala.collection.JavaConversions._ import spark.streaming._ import spark.streaming.StreamingContext._ import spark.api.java.function.{Function => JFunction, Function2 => JFunction2} -import spark.Partitioner +import spark.{RDD, Partitioner} import org.apache.hadoop.mapred.{JobConf, OutputFormat} import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat} import org.apache.hadoop.conf.Configuration -import spark.api.java.JavaPairRDD +import spark.api.java.{JavaRDD, JavaPairRDD} import spark.storage.StorageLevel import com.google.common.base.Optional @@ -81,6 +81,36 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( def union(that: JavaPairDStream[K, V]): JavaPairDStream[K, V] = dstream.union(that.dstream) + /** + * Return a new DStream in which each RDD is generated by applying a function + * on each RDD of this DStream. + */ + def transform[K2, V2](transformFunc: JFunction[JavaPairRDD[K, V], JavaPairRDD[K2, V2]]): + JavaPairDStream[K2, V2] = { + implicit val cmk: ClassManifest[K2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K2]] + implicit val cmv: ClassManifest[V2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[V2]] + def scalaTransform (in: RDD[(K, V)]): RDD[(K2, V2)] = + transformFunc.call(new JavaPairRDD[K, V](in)).rdd + dstream.transform(scalaTransform(_)) + } + + /** + * Return a new DStream in which each RDD is generated by applying a function + * on each RDD of this DStream. 
+ */ + def transform[K2, V2](transformFunc: JFunction2[JavaPairRDD[K, V], Time, JavaPairRDD[K2, V2]]): + JavaPairDStream[K2, V2] = { + implicit val cmk: ClassManifest[K2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K2]] + implicit val cmv: ClassManifest[V2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[V2]] + def scalaTransform (in: RDD[(K, V)], time: Time): RDD[(K2, V2)] = + transformFunc.call(new JavaPairRDD[K, V](in), time).rdd + dstream.transform(scalaTransform(_, _)) + } + // ======================================================================= // Methods only for PairDStream's // ======================================================================= diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 9bfcd83e4d..7b385f609d 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -11,6 +11,7 @@ import org.junit.Before; import org.junit.Test; import scala.Tuple2; import spark.HashPartitioner; +import spark.api.java.JavaPairRDD; import spark.api.java.JavaRDD; import spark.api.java.JavaSparkContext; import spark.api.java.function.*; @@ -872,6 +873,50 @@ public class JavaAPISuite implements Serializable { Assert.assertEquals(expected, result); } + @Test + public void testPairTransform() { + List>> inputData = Arrays.asList( + Arrays.asList( + new Tuple2(3, 5), + new Tuple2(1, 5), + new Tuple2(4, 5), + new Tuple2(2, 5)), + Arrays.asList( + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5), + new Tuple2(1, 5))); + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, 5), + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5)), + Arrays.asList( + new Tuple2(1, 5), + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5))); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream( + ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + + JavaPairDStream sorted = pairStream.transform( + new Function, JavaPairRDD>() { + @Override + public JavaPairRDD call(JavaPairRDD in) throws Exception { + return in.sortByKey(); + } + }); + + JavaTestUtils.attachTestOutputStream(sorted); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + @Test public void testMapValues() { List>> inputData = stringStringKVStream; -- cgit v1.2.3 From ae2234687d9040b42619c374eadfd40c896d386d Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 16 Feb 2013 13:10:31 -0600 Subject: Make CoGroupedRDDs explicitly have the same key type. 
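With the element type tightened from Seq[RDD[(_, _)]] to Seq[RDD[(K, _)]], call sites name the shared key type once and it carries through to the RDD[(K, Seq[Seq[_]])] result. A minimal stand-alone sketch of a direct call (a sketch only: sc, wordCounts and wordTags are made-up names, and the spark.rdd import path is assumed from the file's location):

    import spark.{HashPartitioner, RDD, SparkContext}
    import spark.rdd.CoGroupedRDD

    val sc = new SparkContext("local", "CoGroupSketch")             // assumed local test context
    val wordCounts = sc.parallelize(Seq(("a", 1), ("b", 2)))        // RDD[(String, Int)]
    val wordTags   = sc.parallelize(Seq(("a", "x"), ("c", "y")))    // RDD[(String, String)]

    // Both inputs are viewed as RDD[(String, _)], as PairRDDFunctions.cogroup does below;
    // the result groups the values from each input per String key.
    val grouped = new CoGroupedRDD[String](
      Seq(wordCounts.asInstanceOf[RDD[(String, _)]],
          wordTags.asInstanceOf[RDD[(String, _)]]),
      new HashPartitioner(2))                                       // RDD[(String, Seq[Seq[_]])]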
--- core/src/main/scala/spark/PairRDDFunctions.scala | 8 ++++---- core/src/main/scala/spark/rdd/CoGroupedRDD.scala | 4 ++-- core/src/test/scala/spark/CheckpointSuite.scala | 2 +- .../src/main/scala/spark/streaming/PairDStreamFunctions.scala | 2 +- .../src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala | 2 +- .../scala/spark/streaming/dstream/ReducedWindowedDStream.scala | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'streaming') diff --git a/core/src/main/scala/spark/PairRDDFunctions.scala b/core/src/main/scala/spark/PairRDDFunctions.scala index cc3cca2571..36b9880cd1 100644 --- a/core/src/main/scala/spark/PairRDDFunctions.scala +++ b/core/src/main/scala/spark/PairRDDFunctions.scala @@ -361,7 +361,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( throw new SparkException("Default partitioner cannot partition array keys.") } val cg = new CoGroupedRDD[K]( - Seq(self.asInstanceOf[RDD[(_, _)]], other.asInstanceOf[RDD[(_, _)]]), + Seq(self.asInstanceOf[RDD[(K, _)]], other.asInstanceOf[RDD[(K, _)]]), partitioner) val prfs = new PairRDDFunctions[K, Seq[Seq[_]]](cg)(classManifest[K], Manifests.seqSeqManifest) prfs.mapValues { @@ -380,9 +380,9 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( throw new SparkException("Default partitioner cannot partition array keys.") } val cg = new CoGroupedRDD[K]( - Seq(self.asInstanceOf[RDD[(_, _)]], - other1.asInstanceOf[RDD[(_, _)]], - other2.asInstanceOf[RDD[(_, _)]]), + Seq(self.asInstanceOf[RDD[(K, _)]], + other1.asInstanceOf[RDD[(K, _)]], + other2.asInstanceOf[RDD[(K, _)]]), partitioner) val prfs = new PairRDDFunctions[K, Seq[Seq[_]]](cg)(classManifest[K], Manifests.seqSeqManifest) prfs.mapValues { diff --git a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala index 0a1e2cbee0..868ee5a39f 100644 --- a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala @@ -40,8 +40,8 @@ private[spark] class CoGroupAggregator { (b1, b2) => b1 ++ b2 }) with Serializable -class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) - extends RDD[(K, Seq[Seq[_]])](rdds.head.context, Nil) with Logging { +class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(K, _)]], part: Partitioner) + extends RDD[(K, Seq[Seq[_]])](rdds.head.context, Nil) { private val aggr = new CoGroupAggregator diff --git a/core/src/test/scala/spark/CheckpointSuite.scala b/core/src/test/scala/spark/CheckpointSuite.scala index 0d08fd2396..51ff966ae4 100644 --- a/core/src/test/scala/spark/CheckpointSuite.scala +++ b/core/src/test/scala/spark/CheckpointSuite.scala @@ -347,7 +347,7 @@ object CheckpointSuite { def cogroup[K, V](first: RDD[(K, V)], second: RDD[(K, V)], part: Partitioner) = { //println("First = " + first + ", second = " + second) new CoGroupedRDD[K]( - Seq(first.asInstanceOf[RDD[(_, _)]], second.asInstanceOf[RDD[(_, _)]]), + Seq(first.asInstanceOf[RDD[(K, _)]], second.asInstanceOf[RDD[(K, _)]]), part ).asInstanceOf[RDD[(K, Seq[Seq[V]])]] } diff --git a/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala b/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala index fbcf061126..5db3844f1d 100644 --- a/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala +++ b/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala @@ -457,7 +457,7 @@ extends Serializable { ): DStream[(K, (Seq[V], Seq[W]))] = { val cgd = new CoGroupedDStream[K]( - Seq(self.asInstanceOf[DStream[(_, 
_)]], other.asInstanceOf[DStream[(_, _)]]), + Seq(self.asInstanceOf[DStream[(K, _)]], other.asInstanceOf[DStream[(K, _)]]), partitioner ) val pdfs = new PairDStreamFunctions[K, Seq[Seq[_]]](cgd)( diff --git a/streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala b/streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala index ddb1bf6b28..4ef4bb7de1 100644 --- a/streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala +++ b/streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala @@ -6,7 +6,7 @@ import spark.streaming.{Time, DStream, Duration} private[streaming] class CoGroupedDStream[K : ClassManifest]( - parents: Seq[DStream[(_, _)]], + parents: Seq[DStream[(K, _)]], partitioner: Partitioner ) extends DStream[(K, Seq[Seq[_]])](parents.head.ssc) { diff --git a/streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala b/streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala index 733d5c4a25..263655039c 100644 --- a/streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala +++ b/streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala @@ -101,7 +101,7 @@ class ReducedWindowedDStream[K: ClassManifest, V: ClassManifest]( val allRDDs = new ArrayBuffer[RDD[(K, V)]]() += previousWindowRDD ++= oldRDDs ++= newRDDs // Cogroup the reduced RDDs and merge the reduced values - val cogroupedRDD = new CoGroupedRDD[K](allRDDs.toSeq.asInstanceOf[Seq[RDD[(_, _)]]], partitioner) + val cogroupedRDD = new CoGroupedRDD[K](allRDDs.toSeq.asInstanceOf[Seq[RDD[(K, _)]]], partitioner) //val mergeValuesFunc = mergeValues(oldRDDs.size, newRDDs.size) _ val numOldValues = oldRDDs.size -- cgit v1.2.3 From 7151e1e4c8f4f764c54047ef82b988f887a0b9c7 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Sun, 17 Feb 2013 23:23:08 -0800 Subject: Rename "jobs" to "applications" in the standalone cluster --- core/src/main/scala/spark/SparkContext.scala | 12 +- .../scala/spark/api/java/JavaSparkContext.scala | 22 +-- .../scala/spark/api/python/PythonPartitioner.scala | 2 +- .../spark/deploy/ApplicationDescription.scala | 14 ++ .../main/scala/spark/deploy/DeployMessage.scala | 19 +-- .../main/scala/spark/deploy/JobDescription.scala | 14 -- .../src/main/scala/spark/deploy/JsonProtocol.scala | 18 +-- .../main/scala/spark/deploy/client/Client.scala | 22 +-- .../scala/spark/deploy/client/ClientListener.scala | 2 +- .../scala/spark/deploy/client/TestClient.scala | 6 +- .../spark/deploy/master/ApplicationInfo.scala | 63 ++++++++ .../spark/deploy/master/ApplicationState.scala | 11 ++ .../scala/spark/deploy/master/ExecutorInfo.scala | 4 +- .../main/scala/spark/deploy/master/JobInfo.scala | 63 -------- .../main/scala/spark/deploy/master/JobState.scala | 9 -- .../main/scala/spark/deploy/master/Master.scala | 174 ++++++++++----------- .../scala/spark/deploy/master/MasterWebUI.scala | 22 +-- .../scala/spark/deploy/master/WorkerInfo.scala | 4 +- .../scala/spark/deploy/worker/ExecutorRunner.scala | 26 +-- .../main/scala/spark/deploy/worker/Worker.scala | 20 +-- .../spark/deploy/worker/WorkerArguments.scala | 2 +- .../scala/spark/deploy/worker/WorkerWebUI.scala | 4 +- .../cluster/SparkDeploySchedulerBackend.scala | 15 +- .../mesos/CoarseMesosSchedulerBackend.scala | 4 +- .../scheduler/mesos/MesosSchedulerBackend.scala | 4 +- .../spark/deploy/master/app_details.scala.html | 40 +++++ .../twirl/spark/deploy/master/app_row.scala.html | 20 +++ .../twirl/spark/deploy/master/app_table.scala.html | 21 +++ 
.../spark/deploy/master/executor_row.scala.html | 6 +- .../twirl/spark/deploy/master/index.scala.html | 16 +- .../spark/deploy/master/job_details.scala.html | 40 ----- .../twirl/spark/deploy/master/job_row.scala.html | 20 --- .../twirl/spark/deploy/master/job_table.scala.html | 21 --- .../spark/deploy/worker/executor_row.scala.html | 10 +- .../main/scala/spark/streaming/Checkpoint.scala | 2 +- .../scala/spark/streaming/StreamingContext.scala | 10 +- .../streaming/api/java/JavaStreamingContext.scala | 6 +- 37 files changed, 386 insertions(+), 382 deletions(-) create mode 100644 core/src/main/scala/spark/deploy/ApplicationDescription.scala delete mode 100644 core/src/main/scala/spark/deploy/JobDescription.scala create mode 100644 core/src/main/scala/spark/deploy/master/ApplicationInfo.scala create mode 100644 core/src/main/scala/spark/deploy/master/ApplicationState.scala delete mode 100644 core/src/main/scala/spark/deploy/master/JobInfo.scala delete mode 100644 core/src/main/scala/spark/deploy/master/JobState.scala create mode 100644 core/src/main/twirl/spark/deploy/master/app_details.scala.html create mode 100644 core/src/main/twirl/spark/deploy/master/app_row.scala.html create mode 100644 core/src/main/twirl/spark/deploy/master/app_table.scala.html delete mode 100644 core/src/main/twirl/spark/deploy/master/job_details.scala.html delete mode 100644 core/src/main/twirl/spark/deploy/master/job_row.scala.html delete mode 100644 core/src/main/twirl/spark/deploy/master/job_table.scala.html (limited to 'streaming') diff --git a/core/src/main/scala/spark/SparkContext.scala b/core/src/main/scala/spark/SparkContext.scala index f299b7ea46..d39767c3b3 100644 --- a/core/src/main/scala/spark/SparkContext.scala +++ b/core/src/main/scala/spark/SparkContext.scala @@ -53,7 +53,7 @@ import storage.{StorageStatus, StorageUtils, RDDInfo} * cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster. * * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI. + * @param appName A name for your application, to display on the cluster web UI. * @param sparkHome Location where Spark is installed on cluster nodes. * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. 
@@ -61,7 +61,7 @@ import storage.{StorageStatus, StorageUtils, RDDInfo} */ class SparkContext( val master: String, - val jobName: String, + val appName: String, val sparkHome: String = null, val jars: Seq[String] = Nil, environment: Map[String, String] = Map()) @@ -143,7 +143,7 @@ class SparkContext( case SPARK_REGEX(sparkUrl) => val scheduler = new ClusterScheduler(this) - val backend = new SparkDeploySchedulerBackend(scheduler, this, sparkUrl, jobName) + val backend = new SparkDeploySchedulerBackend(scheduler, this, sparkUrl, appName) scheduler.initialize(backend) scheduler @@ -162,7 +162,7 @@ class SparkContext( val localCluster = new LocalSparkCluster( numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt) val sparkUrl = localCluster.start() - val backend = new SparkDeploySchedulerBackend(scheduler, this, sparkUrl, jobName) + val backend = new SparkDeploySchedulerBackend(scheduler, this, sparkUrl, appName) scheduler.initialize(backend) backend.shutdownCallback = (backend: SparkDeploySchedulerBackend) => { localCluster.stop() @@ -178,9 +178,9 @@ class SparkContext( val coarseGrained = System.getProperty("spark.mesos.coarse", "false").toBoolean val masterWithoutProtocol = master.replaceFirst("^mesos://", "") // Strip initial mesos:// val backend = if (coarseGrained) { - new CoarseMesosSchedulerBackend(scheduler, this, masterWithoutProtocol, jobName) + new CoarseMesosSchedulerBackend(scheduler, this, masterWithoutProtocol, appName) } else { - new MesosSchedulerBackend(scheduler, this, masterWithoutProtocol, jobName) + new MesosSchedulerBackend(scheduler, this, masterWithoutProtocol, appName) } scheduler.initialize(backend) scheduler diff --git a/core/src/main/scala/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/spark/api/java/JavaSparkContext.scala index 50b8970cd8..f75fc27c7b 100644 --- a/core/src/main/scala/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/spark/api/java/JavaSparkContext.scala @@ -23,41 +23,41 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork /** * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI + * @param appName A name for your application, to display on the cluster web UI */ - def this(master: String, jobName: String) = this(new SparkContext(master, jobName)) + def this(master: String, appName: String) = this(new SparkContext(master, appName)) /** * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI + * @param appName A name for your application, to display on the cluster web UI * @param sparkHome The SPARK_HOME directory on the slave nodes * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. */ - def this(master: String, jobName: String, sparkHome: String, jarFile: String) = - this(new SparkContext(master, jobName, sparkHome, Seq(jarFile))) + def this(master: String, appName: String, sparkHome: String, jarFile: String) = + this(new SparkContext(master, appName, sparkHome, Seq(jarFile))) /** * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). 
- * @param jobName A name for your job, to display on the cluster web UI + * @param appName A name for your application, to display on the cluster web UI * @param sparkHome The SPARK_HOME directory on the slave nodes * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. */ - def this(master: String, jobName: String, sparkHome: String, jars: Array[String]) = - this(new SparkContext(master, jobName, sparkHome, jars.toSeq)) + def this(master: String, appName: String, sparkHome: String, jars: Array[String]) = + this(new SparkContext(master, appName, sparkHome, jars.toSeq)) /** * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI + * @param appName A name for your application, to display on the cluster web UI * @param sparkHome The SPARK_HOME directory on the slave nodes * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. * @param environment Environment variables to set on worker nodes */ - def this(master: String, jobName: String, sparkHome: String, jars: Array[String], + def this(master: String, appName: String, sparkHome: String, jars: Array[String], environment: JMap[String, String]) = - this(new SparkContext(master, jobName, sparkHome, jars.toSeq, environment)) + this(new SparkContext(master, appName, sparkHome, jars.toSeq, environment)) private[spark] val env = sc.env diff --git a/core/src/main/scala/spark/api/python/PythonPartitioner.scala b/core/src/main/scala/spark/api/python/PythonPartitioner.scala index 519e310323..d618c098c2 100644 --- a/core/src/main/scala/spark/api/python/PythonPartitioner.scala +++ b/core/src/main/scala/spark/api/python/PythonPartitioner.scala @@ -9,7 +9,7 @@ import java.util.Arrays * * Stores the unique id() of the Python-side partitioning function so that it is incorporated into * equality comparisons. Correctness requires that the id is a unique identifier for the - * lifetime of the job (i.e. that it is not re-used as the id of a different partitioning + * lifetime of the program (i.e. that it is not re-used as the id of a different partitioning * function). This can be ensured by using the Python id() function and maintaining a reference * to the Python partitioning function so that its id() is not reused. 
*/ diff --git a/core/src/main/scala/spark/deploy/ApplicationDescription.scala b/core/src/main/scala/spark/deploy/ApplicationDescription.scala new file mode 100644 index 0000000000..6659e53b25 --- /dev/null +++ b/core/src/main/scala/spark/deploy/ApplicationDescription.scala @@ -0,0 +1,14 @@ +package spark.deploy + +private[spark] class ApplicationDescription( + val name: String, + val cores: Int, + val memoryPerSlave: Int, + val command: Command, + val sparkHome: String) + extends Serializable { + + val user = System.getProperty("user.name", "") + + override def toString: String = "ApplicationDescription(" + name + ")" +} diff --git a/core/src/main/scala/spark/deploy/DeployMessage.scala b/core/src/main/scala/spark/deploy/DeployMessage.scala index 1d88d4bc84..3cbf4fdd98 100644 --- a/core/src/main/scala/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/spark/deploy/DeployMessage.scala @@ -1,7 +1,7 @@ package spark.deploy import spark.deploy.ExecutorState.ExecutorState -import spark.deploy.master.{WorkerInfo, JobInfo} +import spark.deploy.master.{WorkerInfo, ApplicationInfo} import spark.deploy.worker.ExecutorRunner import scala.collection.immutable.List @@ -23,7 +23,7 @@ case class RegisterWorker( private[spark] case class ExecutorStateChanged( - jobId: String, + appId: String, execId: Int, state: ExecutorState, message: Option[String], @@ -36,12 +36,12 @@ private[spark] case class Heartbeat(workerId: String) extends DeployMessage private[spark] case class RegisteredWorker(masterWebUiUrl: String) extends DeployMessage private[spark] case class RegisterWorkerFailed(message: String) extends DeployMessage -private[spark] case class KillExecutor(jobId: String, execId: Int) extends DeployMessage +private[spark] case class KillExecutor(appId: String, execId: Int) extends DeployMessage private[spark] case class LaunchExecutor( - jobId: String, + appId: String, execId: Int, - jobDesc: JobDescription, + appDesc: ApplicationDescription, cores: Int, memory: Int, sparkHome: String) @@ -49,12 +49,13 @@ private[spark] case class LaunchExecutor( // Client to Master -private[spark] case class RegisterJob(jobDescription: JobDescription) extends DeployMessage +private[spark] case class RegisterApplication(appDescription: ApplicationDescription) + extends DeployMessage // Master to Client private[spark] -case class RegisteredJob(jobId: String) extends DeployMessage +case class RegisteredApplication(appId: String) extends DeployMessage private[spark] case class ExecutorAdded(id: Int, workerId: String, host: String, cores: Int, memory: Int) @@ -64,7 +65,7 @@ case class ExecutorUpdated(id: Int, state: ExecutorState, message: Option[String exitStatus: Option[Int]) private[spark] -case class JobKilled(message: String) +case class appKilled(message: String) // Internal message in Client @@ -78,7 +79,7 @@ private[spark] case object RequestMasterState private[spark] case class MasterState(host: String, port: Int, workers: Array[WorkerInfo], - activeJobs: Array[JobInfo], completedJobs: Array[JobInfo]) { + activeApps: Array[ApplicationInfo], completedApps: Array[ApplicationInfo]) { def uri = "spark://" + host + ":" + port } diff --git a/core/src/main/scala/spark/deploy/JobDescription.scala b/core/src/main/scala/spark/deploy/JobDescription.scala deleted file mode 100644 index 7160fc05fc..0000000000 --- a/core/src/main/scala/spark/deploy/JobDescription.scala +++ /dev/null @@ -1,14 +0,0 @@ -package spark.deploy - -private[spark] class JobDescription( - val name: String, - val cores: Int, - val memoryPerSlave: Int, 
- val command: Command, - val sparkHome: String) - extends Serializable { - - val user = System.getProperty("user.name", "") - - override def toString: String = "JobDescription(" + name + ")" -} diff --git a/core/src/main/scala/spark/deploy/JsonProtocol.scala b/core/src/main/scala/spark/deploy/JsonProtocol.scala index 732fa08064..38a6ebfc24 100644 --- a/core/src/main/scala/spark/deploy/JsonProtocol.scala +++ b/core/src/main/scala/spark/deploy/JsonProtocol.scala @@ -1,6 +1,6 @@ package spark.deploy -import master.{JobInfo, WorkerInfo} +import master.{ApplicationInfo, WorkerInfo} import worker.ExecutorRunner import cc.spray.json._ @@ -20,8 +20,8 @@ private[spark] object JsonProtocol extends DefaultJsonProtocol { ) } - implicit object JobInfoJsonFormat extends RootJsonWriter[JobInfo] { - def write(obj: JobInfo) = JsObject( + implicit object AppInfoJsonFormat extends RootJsonWriter[ApplicationInfo] { + def write(obj: ApplicationInfo) = JsObject( "starttime" -> JsNumber(obj.startTime), "id" -> JsString(obj.id), "name" -> JsString(obj.desc.name), @@ -31,8 +31,8 @@ private[spark] object JsonProtocol extends DefaultJsonProtocol { "submitdate" -> JsString(obj.submitDate.toString)) } - implicit object JobDescriptionJsonFormat extends RootJsonWriter[JobDescription] { - def write(obj: JobDescription) = JsObject( + implicit object AppDescriptionJsonFormat extends RootJsonWriter[ApplicationDescription] { + def write(obj: ApplicationDescription) = JsObject( "name" -> JsString(obj.name), "cores" -> JsNumber(obj.cores), "memoryperslave" -> JsNumber(obj.memoryPerSlave), @@ -44,8 +44,8 @@ private[spark] object JsonProtocol extends DefaultJsonProtocol { def write(obj: ExecutorRunner) = JsObject( "id" -> JsNumber(obj.execId), "memory" -> JsNumber(obj.memory), - "jobid" -> JsString(obj.jobId), - "jobdesc" -> obj.jobDesc.toJson.asJsObject + "appid" -> JsString(obj.appId), + "appdesc" -> obj.appDesc.toJson.asJsObject ) } @@ -57,8 +57,8 @@ private[spark] object JsonProtocol extends DefaultJsonProtocol { "coresused" -> JsNumber(obj.workers.map(_.coresUsed).sum), "memory" -> JsNumber(obj.workers.map(_.memory).sum), "memoryused" -> JsNumber(obj.workers.map(_.memoryUsed).sum), - "activejobs" -> JsArray(obj.activeJobs.toList.map(_.toJson)), - "completedjobs" -> JsArray(obj.completedJobs.toList.map(_.toJson)) + "activeapps" -> JsArray(obj.activeApps.toList.map(_.toJson)), + "completedapps" -> JsArray(obj.completedApps.toList.map(_.toJson)) ) } diff --git a/core/src/main/scala/spark/deploy/client/Client.scala b/core/src/main/scala/spark/deploy/client/Client.scala index e01181d1b2..1a95524cf9 100644 --- a/core/src/main/scala/spark/deploy/client/Client.scala +++ b/core/src/main/scala/spark/deploy/client/Client.scala @@ -8,25 +8,25 @@ import akka.pattern.AskTimeoutException import spark.{SparkException, Logging} import akka.remote.RemoteClientLifeCycleEvent import akka.remote.RemoteClientShutdown -import spark.deploy.RegisterJob +import spark.deploy.RegisterApplication import spark.deploy.master.Master import akka.remote.RemoteClientDisconnected import akka.actor.Terminated import akka.dispatch.Await /** - * The main class used to talk to a Spark deploy cluster. Takes a master URL, a job description, - * and a listener for job events, and calls back the listener when various events occur. + * The main class used to talk to a Spark deploy cluster. Takes a master URL, an app description, + * and a listener for cluster events, and calls back the listener when various events occur. 
*/ private[spark] class Client( actorSystem: ActorSystem, masterUrl: String, - jobDescription: JobDescription, + appDescription: ApplicationDescription, listener: ClientListener) extends Logging { var actor: ActorRef = null - var jobId: String = null + var appId: String = null class ClientActor extends Actor with Logging { var master: ActorRef = null @@ -38,7 +38,7 @@ private[spark] class Client( try { master = context.actorFor(Master.toAkkaUrl(masterUrl)) masterAddress = master.path.address - master ! RegisterJob(jobDescription) + master ! RegisterApplication(appDescription) context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent]) context.watch(master) // Doesn't work with remote actors, but useful for testing } catch { @@ -50,17 +50,17 @@ private[spark] class Client( } override def receive = { - case RegisteredJob(jobId_) => - jobId = jobId_ - listener.connected(jobId) + case RegisteredApplication(appId_) => + appId = appId_ + listener.connected(appId) case ExecutorAdded(id: Int, workerId: String, host: String, cores: Int, memory: Int) => - val fullId = jobId + "/" + id + val fullId = appId + "/" + id logInfo("Executor added: %s on %s (%s) with %d cores".format(fullId, workerId, host, cores)) listener.executorAdded(fullId, workerId, host, cores, memory) case ExecutorUpdated(id, state, message, exitStatus) => - val fullId = jobId + "/" + id + val fullId = appId + "/" + id val messageText = message.map(s => " (" + s + ")").getOrElse("") logInfo("Executor updated: %s is now %s%s".format(fullId, state, messageText)) if (ExecutorState.isFinished(state)) { diff --git a/core/src/main/scala/spark/deploy/client/ClientListener.scala b/core/src/main/scala/spark/deploy/client/ClientListener.scala index 7035f4b394..b7008321df 100644 --- a/core/src/main/scala/spark/deploy/client/ClientListener.scala +++ b/core/src/main/scala/spark/deploy/client/ClientListener.scala @@ -8,7 +8,7 @@ package spark.deploy.client * Users of this API should *not* block inside the callback methods. 
*/ private[spark] trait ClientListener { - def connected(jobId: String): Unit + def connected(appId: String): Unit def disconnected(): Unit diff --git a/core/src/main/scala/spark/deploy/client/TestClient.scala b/core/src/main/scala/spark/deploy/client/TestClient.scala index 8764c400e2..dc004b59ca 100644 --- a/core/src/main/scala/spark/deploy/client/TestClient.scala +++ b/core/src/main/scala/spark/deploy/client/TestClient.scala @@ -2,13 +2,13 @@ package spark.deploy.client import spark.util.AkkaUtils import spark.{Logging, Utils} -import spark.deploy.{Command, JobDescription} +import spark.deploy.{Command, ApplicationDescription} private[spark] object TestClient { class TestListener extends ClientListener with Logging { def connected(id: String) { - logInfo("Connected to master, got job ID " + id) + logInfo("Connected to master, got app ID " + id) } def disconnected() { @@ -24,7 +24,7 @@ private[spark] object TestClient { def main(args: Array[String]) { val url = args(0) val (actorSystem, port) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0) - val desc = new JobDescription( + val desc = new ApplicationDescription( "TestClient", 1, 512, Command("spark.deploy.client.TestExecutor", Seq(), Map()), "dummy-spark-home") val listener = new TestListener val client = new Client(actorSystem, url, desc, listener) diff --git a/core/src/main/scala/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/spark/deploy/master/ApplicationInfo.scala new file mode 100644 index 0000000000..3591a94072 --- /dev/null +++ b/core/src/main/scala/spark/deploy/master/ApplicationInfo.scala @@ -0,0 +1,63 @@ +package spark.deploy.master + +import spark.deploy.ApplicationDescription +import java.util.Date +import akka.actor.ActorRef +import scala.collection.mutable + +private[spark] class ApplicationInfo( + val startTime: Long, + val id: String, + val desc: ApplicationDescription, + val submitDate: Date, + val driver: ActorRef) +{ + var state = ApplicationState.WAITING + var executors = new mutable.HashMap[Int, ExecutorInfo] + var coresGranted = 0 + var endTime = -1L + + private var nextExecutorId = 0 + + def newExecutorId(): Int = { + val id = nextExecutorId + nextExecutorId += 1 + id + } + + def addExecutor(worker: WorkerInfo, cores: Int): ExecutorInfo = { + val exec = new ExecutorInfo(newExecutorId(), this, worker, cores, desc.memoryPerSlave) + executors(exec.id) = exec + coresGranted += cores + exec + } + + def removeExecutor(exec: ExecutorInfo) { + executors -= exec.id + coresGranted -= exec.cores + } + + def coresLeft: Int = desc.cores - coresGranted + + private var _retryCount = 0 + + def retryCount = _retryCount + + def incrementRetryCount = { + _retryCount += 1 + _retryCount + } + + def markFinished(endState: ApplicationState.Value) { + state = endState + endTime = System.currentTimeMillis() + } + + def duration: Long = { + if (endTime != -1) { + endTime - startTime + } else { + System.currentTimeMillis() - startTime + } + } +} diff --git a/core/src/main/scala/spark/deploy/master/ApplicationState.scala b/core/src/main/scala/spark/deploy/master/ApplicationState.scala new file mode 100644 index 0000000000..15016b388d --- /dev/null +++ b/core/src/main/scala/spark/deploy/master/ApplicationState.scala @@ -0,0 +1,11 @@ +package spark.deploy.master + +private[spark] object ApplicationState + extends Enumeration("WAITING", "RUNNING", "FINISHED", "FAILED") { + + type ApplicationState = Value + + val WAITING, RUNNING, FINISHED, FAILED = Value + + val MAX_NUM_RETRY = 10 +} diff --git 
a/core/src/main/scala/spark/deploy/master/ExecutorInfo.scala b/core/src/main/scala/spark/deploy/master/ExecutorInfo.scala index 1db2c32633..48e6055fb5 100644 --- a/core/src/main/scala/spark/deploy/master/ExecutorInfo.scala +++ b/core/src/main/scala/spark/deploy/master/ExecutorInfo.scala @@ -4,12 +4,12 @@ import spark.deploy.ExecutorState private[spark] class ExecutorInfo( val id: Int, - val job: JobInfo, + val application: ApplicationInfo, val worker: WorkerInfo, val cores: Int, val memory: Int) { var state = ExecutorState.LAUNCHING - def fullId: String = job.id + "/" + id + def fullId: String = application.id + "/" + id } diff --git a/core/src/main/scala/spark/deploy/master/JobInfo.scala b/core/src/main/scala/spark/deploy/master/JobInfo.scala deleted file mode 100644 index a274b21c34..0000000000 --- a/core/src/main/scala/spark/deploy/master/JobInfo.scala +++ /dev/null @@ -1,63 +0,0 @@ -package spark.deploy.master - -import spark.deploy.JobDescription -import java.util.Date -import akka.actor.ActorRef -import scala.collection.mutable - -private[spark] class JobInfo( - val startTime: Long, - val id: String, - val desc: JobDescription, - val submitDate: Date, - val driver: ActorRef) -{ - var state = JobState.WAITING - var executors = new mutable.HashMap[Int, ExecutorInfo] - var coresGranted = 0 - var endTime = -1L - - private var nextExecutorId = 0 - - def newExecutorId(): Int = { - val id = nextExecutorId - nextExecutorId += 1 - id - } - - def addExecutor(worker: WorkerInfo, cores: Int): ExecutorInfo = { - val exec = new ExecutorInfo(newExecutorId(), this, worker, cores, desc.memoryPerSlave) - executors(exec.id) = exec - coresGranted += cores - exec - } - - def removeExecutor(exec: ExecutorInfo) { - executors -= exec.id - coresGranted -= exec.cores - } - - def coresLeft: Int = desc.cores - coresGranted - - private var _retryCount = 0 - - def retryCount = _retryCount - - def incrementRetryCount = { - _retryCount += 1 - _retryCount - } - - def markFinished(endState: JobState.Value) { - state = endState - endTime = System.currentTimeMillis() - } - - def duration: Long = { - if (endTime != -1) { - endTime - startTime - } else { - System.currentTimeMillis() - startTime - } - } -} diff --git a/core/src/main/scala/spark/deploy/master/JobState.scala b/core/src/main/scala/spark/deploy/master/JobState.scala deleted file mode 100644 index 2b70cf0191..0000000000 --- a/core/src/main/scala/spark/deploy/master/JobState.scala +++ /dev/null @@ -1,9 +0,0 @@ -package spark.deploy.master - -private[spark] object JobState extends Enumeration("WAITING", "RUNNING", "FINISHED", "FAILED") { - type JobState = Value - - val WAITING, RUNNING, FINISHED, FAILED = Value - - val MAX_NUM_RETRY = 10 -} diff --git a/core/src/main/scala/spark/deploy/master/Master.scala b/core/src/main/scala/spark/deploy/master/Master.scala index a5de23261c..1cd68a2aa6 100644 --- a/core/src/main/scala/spark/deploy/master/Master.scala +++ b/core/src/main/scala/spark/deploy/master/Master.scala @@ -16,22 +16,22 @@ import spark.util.AkkaUtils private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging { - val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For job IDs + val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs val WORKER_TIMEOUT = System.getProperty("spark.worker.timeout", "60").toLong * 1000 - var nextJobNumber = 0 + var nextAppNumber = 0 val workers = new HashSet[WorkerInfo] val idToWorker = new HashMap[String, WorkerInfo] val actorToWorker = new HashMap[ActorRef, 
WorkerInfo] val addressToWorker = new HashMap[Address, WorkerInfo] - val jobs = new HashSet[JobInfo] - val idToJob = new HashMap[String, JobInfo] - val actorToJob = new HashMap[ActorRef, JobInfo] - val addressToJob = new HashMap[Address, JobInfo] + val apps = new HashSet[ApplicationInfo] + val idToApp = new HashMap[String, ApplicationInfo] + val actorToApp = new HashMap[ActorRef, ApplicationInfo] + val addressToApp = new HashMap[Address, ApplicationInfo] - val waitingJobs = new ArrayBuffer[JobInfo] - val completedJobs = new ArrayBuffer[JobInfo] + val waitingApps = new ArrayBuffer[ApplicationInfo] + val completedApps = new ArrayBuffer[ApplicationInfo] val masterPublicAddress = { val envVar = System.getenv("SPARK_PUBLIC_DNS") @@ -39,9 +39,9 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor } // As a temporary workaround before better ways of configuring memory, we allow users to set - // a flag that will perform round-robin scheduling across the nodes (spreading out each job - // among all the nodes) instead of trying to consolidate each job onto a small # of nodes. - val spreadOutJobs = System.getProperty("spark.deploy.spreadOut", "false").toBoolean + // a flag that will perform round-robin scheduling across the nodes (spreading out each app + // among all the nodes) instead of trying to consolidate each app onto a small # of nodes. + val spreadOutApps = System.getProperty("spark.deploy.spreadOut", "false").toBoolean override def preStart() { logInfo("Starting Spark master at spark://" + ip + ":" + port) @@ -76,41 +76,41 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor } } - case RegisterJob(description) => { - logInfo("Registering job " + description.name) - val job = addJob(description, sender) - logInfo("Registered job " + description.name + " with ID " + job.id) - waitingJobs += job + case RegisterApplication(description) => { + logInfo("Registering app " + description.name) + val app = addApplication(description, sender) + logInfo("Registered app " + description.name + " with ID " + app.id) + waitingApps += app context.watch(sender) // This doesn't work with remote actors but helps for testing - sender ! RegisteredJob(job.id) + sender ! RegisteredApplication(app.id) schedule() } - case ExecutorStateChanged(jobId, execId, state, message, exitStatus) => { - val execOption = idToJob.get(jobId).flatMap(job => job.executors.get(execId)) + case ExecutorStateChanged(appId, execId, state, message, exitStatus) => { + val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId)) execOption match { case Some(exec) => { exec.state = state - exec.job.driver ! ExecutorUpdated(execId, state, message, exitStatus) + exec.application.driver ! ExecutorUpdated(execId, state, message, exitStatus) if (ExecutorState.isFinished(state)) { - val jobInfo = idToJob(jobId) - // Remove this executor from the worker and job + val appInfo = idToApp(appId) + // Remove this executor from the worker and app logInfo("Removing executor " + exec.fullId + " because it is " + state) - jobInfo.removeExecutor(exec) + appInfo.removeExecutor(exec) exec.worker.removeExecutor(exec) // Only retry certain number of times so we don't go into an infinite loop. 
- if (jobInfo.incrementRetryCount < JobState.MAX_NUM_RETRY) { + if (appInfo.incrementRetryCount < ApplicationState.MAX_NUM_RETRY) { schedule() } else { - logError("Job %s with ID %s failed %d times, removing it".format( - jobInfo.desc.name, jobInfo.id, jobInfo.retryCount)) - removeJob(jobInfo) + logError("Application %s with ID %s failed %d times, removing it".format( + appInfo.desc.name, appInfo.id, appInfo.retryCount)) + removeApplication(appInfo) } } } case None => - logWarning("Got status update for unknown executor " + jobId + "/" + execId) + logWarning("Got status update for unknown executor " + appId + "/" + execId) } } @@ -124,53 +124,53 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor } case Terminated(actor) => { - // The disconnected actor could've been either a worker or a job; remove whichever of + // The disconnected actor could've been either a worker or an app; remove whichever of // those we have an entry for in the corresponding actor hashmap actorToWorker.get(actor).foreach(removeWorker) - actorToJob.get(actor).foreach(removeJob) + actorToApp.get(actor).foreach(removeApplication) } case RemoteClientDisconnected(transport, address) => { - // The disconnected client could've been either a worker or a job; remove whichever it was + // The disconnected client could've been either a worker or an app; remove whichever it was addressToWorker.get(address).foreach(removeWorker) - addressToJob.get(address).foreach(removeJob) + addressToApp.get(address).foreach(removeApplication) } case RemoteClientShutdown(transport, address) => { - // The disconnected client could've been either a worker or a job; remove whichever it was + // The disconnected client could've been either a worker or an app; remove whichever it was addressToWorker.get(address).foreach(removeWorker) - addressToJob.get(address).foreach(removeJob) + addressToApp.get(address).foreach(removeApplication) } case RequestMasterState => { - sender ! MasterState(ip, port, workers.toArray, jobs.toArray, completedJobs.toArray) + sender ! MasterState(ip, port, workers.toArray, apps.toArray, completedApps.toArray) } } /** - * Can a job use the given worker? True if the worker has enough memory and we haven't already - * launched an executor for the job on it (right now the standalone backend doesn't like having + * Can an app use the given worker? True if the worker has enough memory and we haven't already + * launched an executor for the app on it (right now the standalone backend doesn't like having * two executors on the same worker). */ - def canUse(job: JobInfo, worker: WorkerInfo): Boolean = { - worker.memoryFree >= job.desc.memoryPerSlave && !worker.hasExecutor(job) + def canUse(app: ApplicationInfo, worker: WorkerInfo): Boolean = { + worker.memoryFree >= app.desc.memoryPerSlave && !worker.hasExecutor(app) } /** - * Schedule the currently available resources among waiting jobs. This method will be called - * every time a new job joins or resource availability changes. + * Schedule the currently available resources among waiting apps. This method will be called + * every time a new app joins or resource availability changes. */ def schedule() { - // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first job - // in the queue, then the second job, etc. - if (spreadOutJobs) { - // Try to spread out each job among all the nodes, until it has all its cores - for (job <- waitingJobs if job.coresLeft > 0) { + // Right now this is a very simple FIFO scheduler. 
We keep trying to fit in the first app + // in the queue, then the second app, etc. + if (spreadOutApps) { + // Try to spread out each app among all the nodes, until it has all its cores + for (app <- waitingApps if app.coresLeft > 0) { val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE) - .filter(canUse(job, _)).sortBy(_.coresFree).reverse + .filter(canUse(app, _)).sortBy(_.coresFree).reverse val numUsable = usableWorkers.length val assigned = new Array[Int](numUsable) // Number of cores to give on each node - var toAssign = math.min(job.coresLeft, usableWorkers.map(_.coresFree).sum) + var toAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum) var pos = 0 while (toAssign > 0) { if (usableWorkers(pos).coresFree - assigned(pos) > 0) { @@ -182,22 +182,22 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor // Now that we've decided how many cores to give on each node, let's actually give them for (pos <- 0 until numUsable) { if (assigned(pos) > 0) { - val exec = job.addExecutor(usableWorkers(pos), assigned(pos)) - launchExecutor(usableWorkers(pos), exec, job.desc.sparkHome) - job.state = JobState.RUNNING + val exec = app.addExecutor(usableWorkers(pos), assigned(pos)) + launchExecutor(usableWorkers(pos), exec, app.desc.sparkHome) + app.state = ApplicationState.RUNNING } } } } else { - // Pack each job into as few nodes as possible until we've assigned all its cores + // Pack each app into as few nodes as possible until we've assigned all its cores for (worker <- workers if worker.coresFree > 0) { - for (job <- waitingJobs if job.coresLeft > 0) { - if (canUse(job, worker)) { - val coresToUse = math.min(worker.coresFree, job.coresLeft) + for (app <- waitingApps if app.coresLeft > 0) { + if (canUse(app, worker)) { + val coresToUse = math.min(worker.coresFree, app.coresLeft) if (coresToUse > 0) { - val exec = job.addExecutor(worker, coresToUse) - launchExecutor(worker, exec, job.desc.sparkHome) - job.state = JobState.RUNNING + val exec = app.addExecutor(worker, coresToUse) + launchExecutor(worker, exec, app.desc.sparkHome) + app.state = ApplicationState.RUNNING } } } @@ -208,8 +208,8 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor def launchExecutor(worker: WorkerInfo, exec: ExecutorInfo, sparkHome: String) { logInfo("Launching executor " + exec.fullId + " on worker " + worker.id) worker.addExecutor(exec) - worker.actor ! LaunchExecutor(exec.job.id, exec.id, exec.job.desc, exec.cores, exec.memory, sparkHome) - exec.job.driver ! ExecutorAdded(exec.id, worker.id, worker.host, exec.cores, exec.memory) + worker.actor ! LaunchExecutor(exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory, sparkHome) + exec.application.driver ! ExecutorAdded(exec.id, worker.id, worker.host, exec.cores, exec.memory) } def addWorker(id: String, host: String, port: Int, cores: Int, memory: Int, webUiPort: Int, @@ -231,46 +231,46 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor actorToWorker -= worker.actor addressToWorker -= worker.actor.path.address for (exec <- worker.executors.values) { - logInfo("Telling job of lost executor: " + exec.id) - exec.job.driver ! ExecutorUpdated(exec.id, ExecutorState.LOST, Some("worker lost"), None) - exec.job.removeExecutor(exec) + logInfo("Telling app of lost executor: " + exec.id) + exec.application.driver ! 
ExecutorUpdated(exec.id, ExecutorState.LOST, Some("worker lost"), None) + exec.application.removeExecutor(exec) } } - def addJob(desc: JobDescription, driver: ActorRef): JobInfo = { + def addApplication(desc: ApplicationDescription, driver: ActorRef): ApplicationInfo = { val now = System.currentTimeMillis() val date = new Date(now) - val job = new JobInfo(now, newJobId(date), desc, date, driver) - jobs += job - idToJob(job.id) = job - actorToJob(driver) = job - addressToJob(driver.path.address) = job - return job + val app = new ApplicationInfo(now, newApplicationId(date), desc, date, driver) + apps += app + idToApp(app.id) = app + actorToApp(driver) = app + addressToApp(driver.path.address) = app + return app } - def removeJob(job: JobInfo) { - if (jobs.contains(job)) { - logInfo("Removing job " + job.id) - jobs -= job - idToJob -= job.id - actorToJob -= job.driver - addressToWorker -= job.driver.path.address - completedJobs += job // Remember it in our history - waitingJobs -= job - for (exec <- job.executors.values) { + def removeApplication(app: ApplicationInfo) { + if (apps.contains(app)) { + logInfo("Removing app " + app.id) + apps -= app + idToApp -= app.id + actorToApp -= app.driver + addressToWorker -= app.driver.path.address + completedApps += app // Remember it in our history + waitingApps -= app + for (exec <- app.executors.values) { exec.worker.removeExecutor(exec) - exec.worker.actor ! KillExecutor(exec.job.id, exec.id) + exec.worker.actor ! KillExecutor(exec.application.id, exec.id) } - job.markFinished(JobState.FINISHED) // TODO: Mark it as FAILED if it failed + app.markFinished(ApplicationState.FINISHED) // TODO: Mark it as FAILED if it failed schedule() } } - /** Generate a new job ID given a job's submission date */ - def newJobId(submitDate: Date): String = { - val jobId = "job-%s-%04d".format(DATE_FORMAT.format(submitDate), nextJobNumber) - nextJobNumber += 1 - jobId + /** Generate a new app ID given a app's submission date */ + def newApplicationId(submitDate: Date): String = { + val appId = "app-%s-%04d".format(DATE_FORMAT.format(submitDate), nextAppNumber) + nextAppNumber += 1 + appId } /** Check for, and remove, any timed-out workers */ diff --git a/core/src/main/scala/spark/deploy/master/MasterWebUI.scala b/core/src/main/scala/spark/deploy/master/MasterWebUI.scala index 529f72e9da..54faa375fb 100644 --- a/core/src/main/scala/spark/deploy/master/MasterWebUI.scala +++ b/core/src/main/scala/spark/deploy/master/MasterWebUI.scala @@ -40,27 +40,27 @@ class MasterWebUI(val actorSystem: ActorSystem, master: ActorRef) extends Direct } } } ~ - path("job") { - parameters("jobId", 'format ?) { - case (jobId, Some(js)) if (js.equalsIgnoreCase("json")) => + path("app") { + parameters("appId", 'format ?) { + case (appId, Some(js)) if (js.equalsIgnoreCase("json")) => val future = master ? RequestMasterState - val jobInfo = for (masterState <- future.mapTo[MasterState]) yield { - masterState.activeJobs.find(_.id == jobId).getOrElse({ - masterState.completedJobs.find(_.id == jobId).getOrElse(null) + val appInfo = for (masterState <- future.mapTo[MasterState]) yield { + masterState.activeApps.find(_.id == appId).getOrElse({ + masterState.completedApps.find(_.id == appId).getOrElse(null) }) } respondWithMediaType(MediaTypes.`application/json`) { ctx => - ctx.complete(jobInfo.mapTo[JobInfo]) + ctx.complete(appInfo.mapTo[ApplicationInfo]) } - case (jobId, _) => + case (appId, _) => completeWith { val future = master ? 
RequestMasterState future.map { state => val masterState = state.asInstanceOf[MasterState] - val job = masterState.activeJobs.find(_.id == jobId).getOrElse({ - masterState.completedJobs.find(_.id == jobId).getOrElse(null) + val app = masterState.activeApps.find(_.id == appId).getOrElse({ + masterState.completedApps.find(_.id == appId).getOrElse(null) }) - spark.deploy.master.html.job_details.render(job) + spark.deploy.master.html.app_details.render(app) } } } diff --git a/core/src/main/scala/spark/deploy/master/WorkerInfo.scala b/core/src/main/scala/spark/deploy/master/WorkerInfo.scala index 2e467007a0..23df1bb463 100644 --- a/core/src/main/scala/spark/deploy/master/WorkerInfo.scala +++ b/core/src/main/scala/spark/deploy/master/WorkerInfo.scala @@ -37,8 +37,8 @@ private[spark] class WorkerInfo( } } - def hasExecutor(job: JobInfo): Boolean = { - executors.values.exists(_.job == job) + def hasExecutor(app: ApplicationInfo): Boolean = { + executors.values.exists(_.application == app) } def webUiAddress : String = { diff --git a/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala index 69f34e604a..de11771c8e 100644 --- a/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala @@ -1,7 +1,7 @@ package spark.deploy.worker import java.io._ -import spark.deploy.{ExecutorState, ExecutorStateChanged, JobDescription} +import spark.deploy.{ExecutorState, ExecutorStateChanged, ApplicationDescription} import akka.actor.ActorRef import spark.{Utils, Logging} import java.net.{URI, URL} @@ -14,9 +14,9 @@ import spark.deploy.ExecutorStateChanged * Manages the execution of one executor process. */ private[spark] class ExecutorRunner( - val jobId: String, + val appId: String, val execId: Int, - val jobDesc: JobDescription, + val appDesc: ApplicationDescription, val cores: Int, val memory: Int, val worker: ActorRef, @@ -26,7 +26,7 @@ private[spark] class ExecutorRunner( val workDir: File) extends Logging { - val fullId = jobId + "/" + execId + val fullId = appId + "/" + execId var workerThread: Thread = null var process: Process = null var shutdownHook: Thread = null @@ -60,7 +60,7 @@ private[spark] class ExecutorRunner( process.destroy() process.waitFor() } - worker ! ExecutorStateChanged(jobId, execId, ExecutorState.KILLED, None, None) + worker ! 
ExecutorStateChanged(appId, execId, ExecutorState.KILLED, None, None) Runtime.getRuntime.removeShutdownHook(shutdownHook) } } @@ -74,10 +74,10 @@ private[spark] class ExecutorRunner( } def buildCommandSeq(): Seq[String] = { - val command = jobDesc.command - val script = if (System.getProperty("os.name").startsWith("Windows")) "run.cmd" else "run"; + val command = appDesc.command + val script = if (System.getProperty("os.name").startsWith("Windows")) "run.cmd" else "run" val runScript = new File(sparkHome, script).getCanonicalPath - Seq(runScript, command.mainClass) ++ (command.arguments ++ Seq(jobId)).map(substituteVariables) + Seq(runScript, command.mainClass) ++ (command.arguments ++ Seq(appId)).map(substituteVariables) } /** Spawn a thread that will redirect a given stream to a file */ @@ -96,12 +96,12 @@ private[spark] class ExecutorRunner( } /** - * Download and run the executor described in our JobDescription + * Download and run the executor described in our ApplicationDescription */ def fetchAndRunExecutor() { try { // Create the executor's working directory - val executorDir = new File(workDir, jobId + "/" + execId) + val executorDir = new File(workDir, appId + "/" + execId) if (!executorDir.mkdirs()) { throw new IOException("Failed to create directory " + executorDir) } @@ -110,7 +110,7 @@ private[spark] class ExecutorRunner( val command = buildCommandSeq() val builder = new ProcessBuilder(command: _*).directory(executorDir) val env = builder.environment() - for ((key, value) <- jobDesc.command.environment) { + for ((key, value) <- appDesc.command.environment) { env.put(key, value) } env.put("SPARK_MEM", memory.toString + "m") @@ -128,7 +128,7 @@ private[spark] class ExecutorRunner( // times on the same machine. val exitCode = process.waitFor() val message = "Command exited with code " + exitCode - worker ! ExecutorStateChanged(jobId, execId, ExecutorState.FAILED, Some(message), + worker ! ExecutorStateChanged(appId, execId, ExecutorState.FAILED, Some(message), Some(exitCode)) } catch { case interrupted: InterruptedException => @@ -140,7 +140,7 @@ private[spark] class ExecutorRunner( process.destroy() } val message = e.getClass + ": " + e.getMessage - worker ! ExecutorStateChanged(jobId, execId, ExecutorState.FAILED, Some(message), None) + worker ! ExecutorStateChanged(appId, execId, ExecutorState.FAILED, Some(message), None) } } } diff --git a/core/src/main/scala/spark/deploy/worker/Worker.scala b/core/src/main/scala/spark/deploy/worker/Worker.scala index 924935a5fd..2bbc931316 100644 --- a/core/src/main/scala/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/spark/deploy/worker/Worker.scala @@ -109,19 +109,19 @@ private[spark] class Worker( logError("Worker registration failed: " + message) System.exit(1) - case LaunchExecutor(jobId, execId, jobDesc, cores_, memory_, execSparkHome_) => - logInfo("Asked to launch executor %s/%d for %s".format(jobId, execId, jobDesc.name)) + case LaunchExecutor(appId, execId, appDesc, cores_, memory_, execSparkHome_) => + logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name)) val manager = new ExecutorRunner( - jobId, execId, jobDesc, cores_, memory_, self, workerId, ip, new File(execSparkHome_), workDir) - executors(jobId + "/" + execId) = manager + appId, execId, appDesc, cores_, memory_, self, workerId, ip, new File(execSparkHome_), workDir) + executors(appId + "/" + execId) = manager manager.start() coresUsed += cores_ memoryUsed += memory_ - master ! 
ExecutorStateChanged(jobId, execId, ExecutorState.RUNNING, None, None) + master ! ExecutorStateChanged(appId, execId, ExecutorState.RUNNING, None, None) - case ExecutorStateChanged(jobId, execId, state, message, exitStatus) => - master ! ExecutorStateChanged(jobId, execId, state, message, exitStatus) - val fullId = jobId + "/" + execId + case ExecutorStateChanged(appId, execId, state, message, exitStatus) => + master ! ExecutorStateChanged(appId, execId, state, message, exitStatus) + val fullId = appId + "/" + execId if (ExecutorState.isFinished(state)) { val executor = executors(fullId) logInfo("Executor " + fullId + " finished with state " + state + @@ -133,8 +133,8 @@ private[spark] class Worker( memoryUsed -= executor.memory } - case KillExecutor(jobId, execId) => - val fullId = jobId + "/" + execId + case KillExecutor(appId, execId) => + val fullId = appId + "/" + execId executors.get(fullId) match { case Some(executor) => logInfo("Asked to kill executor " + fullId) diff --git a/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala b/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala index 37524a7c82..08f02bad80 100644 --- a/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala +++ b/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala @@ -92,7 +92,7 @@ private[spark] class WorkerArguments(args: Array[String]) { "Options:\n" + " -c CORES, --cores CORES Number of cores to use\n" + " -m MEM, --memory MEM Amount of memory to use (e.g. 1000M, 2G)\n" + - " -d DIR, --work-dir DIR Directory to run jobs in (default: SPARK_HOME/work)\n" + + " -d DIR, --work-dir DIR Directory to run apps in (default: SPARK_HOME/work)\n" + " -i IP, --ip IP IP address or DNS name to listen on\n" + " -p PORT, --port PORT Port to listen on (default: random)\n" + " --webui-port PORT Port for web UI (default: 8081)") diff --git a/core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala b/core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala index ef81f072a3..135cc2e86c 100644 --- a/core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala +++ b/core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala @@ -41,9 +41,9 @@ class WorkerWebUI(val actorSystem: ActorSystem, worker: ActorRef) extends Direct } } ~ path("log") { - parameters("jobId", "executorId", "logType") { (jobId, executorId, logType) => + parameters("appId", "executorId", "logType") { (appId, executorId, logType) => respondWithMediaType(cc.spray.http.MediaTypes.`text/plain`) { - getFromFileName("work/" + jobId + "/" + executorId + "/" + logType) + getFromFileName("work/" + appId + "/" + executorId + "/" + logType) } } } ~ diff --git a/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index e77355c6cd..bb289c9cf3 100644 --- a/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -2,14 +2,14 @@ package spark.scheduler.cluster import spark.{Utils, Logging, SparkContext} import spark.deploy.client.{Client, ClientListener} -import spark.deploy.{Command, JobDescription} +import spark.deploy.{Command, ApplicationDescription} import scala.collection.mutable.HashMap private[spark] class SparkDeploySchedulerBackend( scheduler: ClusterScheduler, sc: SparkContext, master: String, - jobName: String) + appName: String) extends StandaloneSchedulerBackend(scheduler, sc.env.actorSystem) with ClientListener with 
Logging { @@ -29,10 +29,11 @@ private[spark] class SparkDeploySchedulerBackend( StandaloneSchedulerBackend.ACTOR_NAME) val args = Seq(driverUrl, "{{EXECUTOR_ID}}", "{{HOSTNAME}}", "{{CORES}}") val command = Command("spark.executor.StandaloneExecutorBackend", args, sc.executorEnvs) - val sparkHome = sc.getSparkHome().getOrElse(throw new IllegalArgumentException("must supply spark home for spark standalone")) - val jobDesc = new JobDescription(jobName, maxCores, executorMemory, command, sparkHome) + val sparkHome = sc.getSparkHome().getOrElse( + throw new IllegalArgumentException("must supply spark home for spark standalone")) + val appDesc = new ApplicationDescription(appName, maxCores, executorMemory, command, sparkHome) - client = new Client(sc.env.actorSystem, master, jobDesc, this) + client = new Client(sc.env.actorSystem, master, appDesc, this) client.start() } @@ -45,8 +46,8 @@ private[spark] class SparkDeploySchedulerBackend( } } - override def connected(jobId: String) { - logInfo("Connected to Spark cluster with job ID " + jobId) + override def connected(appId: String) { + logInfo("Connected to Spark cluster with app ID " + appId) } override def disconnected() { diff --git a/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala index 7caf06e917..f4a2994b6d 100644 --- a/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala @@ -28,7 +28,7 @@ private[spark] class CoarseMesosSchedulerBackend( scheduler: ClusterScheduler, sc: SparkContext, master: String, - frameworkName: String) + appName: String) extends StandaloneSchedulerBackend(scheduler, sc.env.actorSystem) with MScheduler with Logging { @@ -76,7 +76,7 @@ private[spark] class CoarseMesosSchedulerBackend( setDaemon(true) override def run() { val scheduler = CoarseMesosSchedulerBackend.this - val fwInfo = FrameworkInfo.newBuilder().setUser("").setName(frameworkName).build() + val fwInfo = FrameworkInfo.newBuilder().setUser("").setName(appName).build() driver = new MesosSchedulerDriver(scheduler, fwInfo, master) try { { val ret = driver.run() diff --git a/core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala index 300766d0f5..ca7fab4cc5 100644 --- a/core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala @@ -24,7 +24,7 @@ private[spark] class MesosSchedulerBackend( scheduler: ClusterScheduler, sc: SparkContext, master: String, - frameworkName: String) + appName: String) extends SchedulerBackend with MScheduler with Logging { @@ -49,7 +49,7 @@ private[spark] class MesosSchedulerBackend( setDaemon(true) override def run() { val scheduler = MesosSchedulerBackend.this - val fwInfo = FrameworkInfo.newBuilder().setUser("").setName(frameworkName).build() + val fwInfo = FrameworkInfo.newBuilder().setUser("").setName(appName).build() driver = new MesosSchedulerDriver(scheduler, fwInfo, master) try { val ret = driver.run() diff --git a/core/src/main/twirl/spark/deploy/master/app_details.scala.html b/core/src/main/twirl/spark/deploy/master/app_details.scala.html new file mode 100644 index 0000000000..301a7e2124 --- /dev/null +++ b/core/src/main/twirl/spark/deploy/master/app_details.scala.html @@ -0,0 +1,40 @@ +@(app: spark.deploy.master.ApplicationInfo) + 
+@spark.common.html.layout(title = "Application Details") {
+
+  <!-- Application Details -->
+  <div class="row">
+    <div class="span12">
+      <ul class="unstyled">
+        <li><strong>ID:</strong> @app.id</li>
+        <li><strong>Description:</strong> @app.desc.name</li>
+        <li><strong>User:</strong> @app.desc.user</li>
+        <li><strong>Cores:</strong>
+          @app.desc.cores
+          (@app.coresGranted Granted
+          @if(app.desc.cores == Integer.MAX_VALUE) {
+
+          } else {
+            , @app.coresLeft
+          }
+          )
+        </li>
+        <li><strong>Memory per Slave:</strong> @app.desc.memoryPerSlave</li>
+        <li><strong>Submit Date:</strong> @app.submitDate</li>
+        <li><strong>State:</strong> @app.state</li>
+      </ul>
+    </div>
+  </div>
+
+  <hr/>
+
+  <!-- Executors -->
+  <div class="row">
+    <div class="span12">
+      <h3> Executor Summary </h3>
+      <br/>
+      @executors_table(app.executors.values.toList)
+    </div>
+  </div>
+
+}
diff --git a/core/src/main/twirl/spark/deploy/master/app_row.scala.html b/core/src/main/twirl/spark/deploy/master/app_row.scala.html
new file mode 100644
index 0000000000..feb306f35c
--- /dev/null
+++ b/core/src/main/twirl/spark/deploy/master/app_row.scala.html
@@ -0,0 +1,20 @@
+@(app: spark.deploy.master.ApplicationInfo)
+
+@import spark.Utils
+@import spark.deploy.WebUI.formatDate
+@import spark.deploy.WebUI.formatDuration
+
+<tr>
+  <td>
+    <a href="app?appId=@(app.id)">@app.id</a>
+  </td>
+  <td>@app.desc.name</td>
+  <td>
+    @app.coresGranted
+  </td>
+  <td>@Utils.memoryMegabytesToString(app.desc.memoryPerSlave)</td>
+  <td>@formatDate(app.submitDate)</td>
+  <td>@app.desc.user</td>
+  <td>@app.state.toString()</td>
+  <td>@formatDuration(app.duration)</td>
+</tr>
diff --git a/core/src/main/twirl/spark/deploy/master/app_table.scala.html b/core/src/main/twirl/spark/deploy/master/app_table.scala.html
new file mode 100644
index 0000000000..f789cee0f1
--- /dev/null
+++ b/core/src/main/twirl/spark/deploy/master/app_table.scala.html
@@ -0,0 +1,21 @@
+@(apps: Array[spark.deploy.master.ApplicationInfo])
+
+<table class="table table-bordered table-striped table-condensed sortable">
+  <thead>
+    <tr>
+      <th>ID</th>
+      <th>Description</th>
+      <th>Cores</th>
+      <th>Memory per Node</th>
+      <th>Submit Time</th>
+      <th>User</th>
+      <th>State</th>
+      <th>Duration</th>
+    </tr>
+  </thead>
+  <tbody>
+    @for(j <- apps) {
+      @app_row(j)
+    }
+  </tbody>
+</table>
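For context on how these new app_* templates are reached: the MasterWebUI change earlier in this patch renames the route and query parameter from job/jobId to app/appId and keeps the optional format=json mode. A minimal sketch of querying that endpoint from Scala follows; the host, port and application ID are placeholders (not taken from the patch), and the default master web UI port is assumed to be 8080.

import scala.io.Source

// Hypothetical client for the renamed master web UI route; all values below are placeholders.
object AppDetailsJsonExample {
  def main(args: Array[String]) {
    val masterWebUi = "http://localhost:8080"       // assumed master web UI address
    val appId = "app-20130211092037-0000"           // example ID in the new app-... format

    // Formerly /job?jobId=...&format=json; after this patch it is /app?appId=...&format=json.
    val url = masterWebUi + "/app?appId=" + appId + "&format=json"
    println(Source.fromURL(url).mkString)           // prints the ApplicationInfo rendered as JSON
  }
}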
diff --git a/core/src/main/twirl/spark/deploy/master/executor_row.scala.html b/core/src/main/twirl/spark/deploy/master/executor_row.scala.html index 784d692fc2..d2d80fad48 100644 --- a/core/src/main/twirl/spark/deploy/master/executor_row.scala.html +++ b/core/src/main/twirl/spark/deploy/master/executor_row.scala.html @@ -9,7 +9,7 @@ @executor.memory @executor.state - stdout - stderr + stdout + stderr - \ No newline at end of file + diff --git a/core/src/main/twirl/spark/deploy/master/index.scala.html b/core/src/main/twirl/spark/deploy/master/index.scala.html index cb1651c7e1..ac51a39a51 100644 --- a/core/src/main/twirl/spark/deploy/master/index.scala.html +++ b/core/src/main/twirl/spark/deploy/master/index.scala.html @@ -14,7 +14,7 @@ @{state.workers.map(_.coresUsed).sum} Used
   <li><strong>Memory:</strong>
     @{Utils.memoryMegabytesToString(state.workers.map(_.memory).sum)} Total,
     @{Utils.memoryMegabytesToString(state.workers.map(_.memoryUsed).sum)} Used</li>
-  <li><strong>Jobs:</strong> @state.activeJobs.size Running, @state.completedJobs.size Completed</li>
+  <li><strong>Applications:</strong> @state.activeApps.size Running, @state.completedApps.size Completed</li>
@@ -22,7 +22,7 @@

   <!-- Cluster Summary -->
   <h3> Workers </h3>
   @worker_table(state.workers.sortBy(_.id))

@@ -30,23 +30,23 @@

-  <h3> Running Jobs </h3>
-  @job_table(state.activeJobs.sortBy(_.startTime).reverse)
+  <h3> Running Applications </h3>
+  @app_table(state.activeApps.sortBy(_.startTime).reverse)

-  <h3> Completed Jobs </h3>
-  @job_table(state.completedJobs.sortBy(_.endTime).reverse)
+  <h3> Completed Applications </h3>
+  @app_table(state.completedApps.sortBy(_.endTime).reverse)
    diff --git a/core/src/main/twirl/spark/deploy/master/job_details.scala.html b/core/src/main/twirl/spark/deploy/master/job_details.scala.html deleted file mode 100644 index d02a51b214..0000000000 --- a/core/src/main/twirl/spark/deploy/master/job_details.scala.html +++ /dev/null @@ -1,40 +0,0 @@ -@(job: spark.deploy.master.JobInfo) - -@spark.common.html.layout(title = "Job Details") { - - -
    -
    -
      -
-        <li><strong>ID:</strong> @job.id</li>
-        <li><strong>Description:</strong> @job.desc.name</li>
-        <li><strong>User:</strong> @job.desc.user</li>
-        <li><strong>Cores:</strong>
-          @job.desc.cores
-          (@job.coresGranted Granted
-          @if(job.desc.cores == Integer.MAX_VALUE) {
-
-          } else {
-            , @job.coresLeft
-          }
-          )
-        </li>
-        <li><strong>Memory per Slave:</strong> @job.desc.memoryPerSlave</li>
-        <li><strong>Submit Date:</strong> @job.submitDate</li>
-        <li><strong>State:</strong> @job.state</li>
-      </ul>
-    </div>
-  </div>
-
-  <hr/>
-
-  <!-- Executors -->
-  <div class="row">
-    <div class="span12">
-      <h3> Executor Summary </h3>
-      <br/>
-      @executors_table(job.executors.values.toList)
-    </div>
-  </div>
-
-}
diff --git a/core/src/main/twirl/spark/deploy/master/job_row.scala.html b/core/src/main/twirl/spark/deploy/master/job_row.scala.html
deleted file mode 100644
index 7c466a6a2c..0000000000
--- a/core/src/main/twirl/spark/deploy/master/job_row.scala.html
+++ /dev/null
@@ -1,20 +0,0 @@
-@(job: spark.deploy.master.JobInfo)
-
-@import spark.Utils
-@import spark.deploy.WebUI.formatDate
-@import spark.deploy.WebUI.formatDuration
-
-<tr>
-  <td>
-    <a href="job?jobId=@(job.id)">@job.id</a>
-  </td>
-  <td>@job.desc.name</td>
-  <td>
-    @job.coresGranted
-  </td>
-  <td>@Utils.memoryMegabytesToString(job.desc.memoryPerSlave)</td>
-  <td>@formatDate(job.submitDate)</td>
-  <td>@job.desc.user</td>
-  <td>@job.state.toString()</td>
-  <td>@formatDuration(job.duration)</td>
-</tr>
diff --git a/core/src/main/twirl/spark/deploy/master/job_table.scala.html b/core/src/main/twirl/spark/deploy/master/job_table.scala.html
deleted file mode 100644
index d267d6e85e..0000000000
--- a/core/src/main/twirl/spark/deploy/master/job_table.scala.html
+++ /dev/null
@@ -1,21 +0,0 @@
-@(jobs: Array[spark.deploy.master.JobInfo])
-
-<table class="table table-bordered table-striped table-condensed sortable">
-  <thead>
-    <tr>
-      <th>JobID</th>
-      <th>Description</th>
-      <th>Cores</th>
-      <th>Memory per Node</th>
-      <th>Submit Time</th>
-      <th>User</th>
-      <th>State</th>
-      <th>Duration</th>
-    </tr>
-  </thead>
-  <tbody>
-    @for(j <- jobs) {
-      @job_row(j)
-    }
-  </tbody>
-</table>
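A note on the IDs shown in the tables above: newApplicationId in Master.scala now produces identifiers of the form app-<date>-<counter> instead of job-<date>-<counter>. The sketch below mirrors that logic outside the Master; the DATE_FORMAT pattern is not visible in this excerpt, so the yyyyMMddHHmmss pattern used here is an assumption.

import java.text.SimpleDateFormat
import java.util.Date

// Standalone sketch of the renamed ID scheme; the date pattern is assumed, not taken from the patch.
object AppIdExample {
  private val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss")  // assumed to match Master.DATE_FORMAT
  private var nextAppNumber = 0

  def newApplicationId(submitDate: Date): String = {
    val appId = "app-%s-%04d".format(DATE_FORMAT.format(submitDate), nextAppNumber)
    nextAppNumber += 1
    appId
  }

  def main(args: Array[String]) {
    println(newApplicationId(new Date()))  // e.g. app-20130211092037-0000
  }
}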
    diff --git a/core/src/main/twirl/spark/deploy/worker/executor_row.scala.html b/core/src/main/twirl/spark/deploy/worker/executor_row.scala.html index ea9542461e..dad0a89080 100644 --- a/core/src/main/twirl/spark/deploy/worker/executor_row.scala.html +++ b/core/src/main/twirl/spark/deploy/worker/executor_row.scala.html @@ -8,13 +8,13 @@ @Utils.memoryMegabytesToString(executor.memory)
      -
-      <li><strong>ID:</strong> @executor.jobId</li>
-      <li><strong>Name:</strong> @executor.jobDesc.name</li>
-      <li><strong>User:</strong> @executor.jobDesc.user</li>
+      <li><strong>ID:</strong> @executor.appId</li>
+      <li><strong>Name:</strong> @executor.appDesc.name</li>
+      <li><strong>User:</strong> @executor.appDesc.user</li>
    - stdout - stderr + stdout + stderr diff --git a/streaming/src/main/scala/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/spark/streaming/Checkpoint.scala index 2f3adb39c2..80244520a3 100644 --- a/streaming/src/main/scala/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/spark/streaming/Checkpoint.scala @@ -12,7 +12,7 @@ private[streaming] class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time) extends Logging with Serializable { val master = ssc.sc.master - val framework = ssc.sc.jobName + val framework = ssc.sc.appName val sparkHome = ssc.sc.sparkHome val jars = ssc.sc.jars val graph = ssc.graph diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala index 37ba524b48..0cce2b13cf 100644 --- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala @@ -39,11 +39,11 @@ class StreamingContext private ( /** * Creates a StreamingContext by providing the details necessary for creating a new SparkContext. * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param frameworkName A name for your job, to display on the cluster web UI + * @param appName A name for your job, to display on the cluster web UI * @param batchDuration The time interval at which streaming data will be divided into batches */ - def this(master: String, frameworkName: String, batchDuration: Duration) = - this(StreamingContext.createNewSparkContext(master, frameworkName), null, batchDuration) + def this(master: String, appName: String, batchDuration: Duration) = + this(StreamingContext.createNewSparkContext(master, appName), null, batchDuration) /** * Re-creates a StreamingContext from a checkpoint file. @@ -384,14 +384,14 @@ object StreamingContext { new PairDStreamFunctions[K, V](stream) } - protected[streaming] def createNewSparkContext(master: String, frameworkName: String): SparkContext = { + protected[streaming] def createNewSparkContext(master: String, appName: String): SparkContext = { // Set the default cleaner delay to an hour if not already set. // This should be sufficient for even 1 second interval. if (MetadataCleaner.getDelaySeconds < 0) { MetadataCleaner.setDelaySeconds(3600) } - new SparkContext(master, frameworkName) + new SparkContext(master, appName) } protected[streaming] def rddToFileName[T](prefix: String, suffix: String, time: Time): String = { diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala index e7f446a49b..e5b5e9ac23 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala @@ -27,11 +27,11 @@ class JavaStreamingContext(val ssc: StreamingContext) { /** * Creates a StreamingContext. 
   * @param master Name of the Spark Master
-  * @param frameworkName Name to be used when registering with the scheduler
+  * @param appName Name to be used when registering with the scheduler
   * @param batchDuration The time interval at which streaming data will be divided into batches
   */
-  def this(master: String, frameworkName: String, batchDuration: Duration) =
-    this(new StreamingContext(master, frameworkName, batchDuration))
+  def this(master: String, appName: String, batchDuration: Duration) =
+    this(new StreamingContext(master, appName, batchDuration))

   /**
    * Creates a StreamingContext.
-- cgit v1.2.3
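To illustrate the user-facing effect of the streaming changes above: StreamingContext and JavaStreamingContext now take an application name, which is what gets registered with the scheduler and displayed under "Running Applications" on the master web UI. A minimal hypothetical driver follows; the master URL, socket host/port and batch interval are placeholders, and the socketTextStream input is assumed to be available in this version of the streaming API.

import spark.streaming.{Seconds, StreamingContext}

// Hypothetical streaming driver; everything except the renamed constructor parameter is a placeholder.
object StreamingAppNameExample {
  def main(args: Array[String]) {
    // The second argument is the application name (formerly frameworkName/jobName).
    val ssc = new StreamingContext("spark://localhost:7077", "MyStreamingApp", Seconds(1))

    val lines = ssc.socketTextStream("localhost", 9999)  // assumed text-over-socket input
    lines.print()                                        // print a few elements of each batch
    ssc.start()
  }
}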