author    Prashant Sharma <prashant.s@imaginea.com>    2013-07-03 11:43:26 +0530
committer Prashant Sharma <prashant.s@imaginea.com>    2013-07-03 11:43:26 +0530
commit    a5f1f6a907b116325c56d38157ec2df76150951e (patch)
tree      27de949c24a61b2301c7690db9e28992f49ea39c /examples/src
parent    b7794813b181f13801596e8d8c3b4471c0c84f20 (diff)
parent    6d60fe571a405eb9306a2be1817901316a46f892 (diff)
download  spark-a5f1f6a907b116325c56d38157ec2df76150951e.tar.gz
          spark-a5f1f6a907b116325c56d38157ec2df76150951e.tar.bz2
          spark-a5f1f6a907b116325c56d38157ec2df76150951e.zip
Merge branch 'master' into master-merge
Conflicts:
	core/pom.xml
	core/src/main/scala/spark/MapOutputTracker.scala
	core/src/main/scala/spark/RDD.scala
	core/src/main/scala/spark/RDDCheckpointData.scala
	core/src/main/scala/spark/SparkContext.scala
	core/src/main/scala/spark/Utils.scala
	core/src/main/scala/spark/api/python/PythonRDD.scala
	core/src/main/scala/spark/deploy/client/Client.scala
	core/src/main/scala/spark/deploy/master/MasterWebUI.scala
	core/src/main/scala/spark/deploy/worker/Worker.scala
	core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala
	core/src/main/scala/spark/rdd/BlockRDD.scala
	core/src/main/scala/spark/rdd/ZippedRDD.scala
	core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala
	core/src/main/scala/spark/storage/BlockManager.scala
	core/src/main/scala/spark/storage/BlockManagerMaster.scala
	core/src/main/scala/spark/storage/BlockManagerMasterActor.scala
	core/src/main/scala/spark/storage/BlockManagerUI.scala
	core/src/main/scala/spark/util/AkkaUtils.scala
	core/src/test/scala/spark/SizeEstimatorSuite.scala
	pom.xml
	project/SparkBuild.scala
	repl/src/main/scala/spark/repl/SparkILoop.scala
	repl/src/test/scala/spark/repl/ReplSuite.scala
	streaming/src/main/scala/spark/streaming/StreamingContext.scala
	streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala
	streaming/src/main/scala/spark/streaming/dstream/KafkaInputDStream.scala
	streaming/src/main/scala/spark/streaming/util/MasterFailureTest.scala
Diffstat (limited to 'examples/src')
-rw-r--r--  examples/src/main/scala/spark/examples/CassandraTest.scala | 196
-rw-r--r--  examples/src/main/scala/spark/examples/HBaseTest.scala | 35
-rw-r--r--  examples/src/main/scala/spark/examples/SparkHdfsLR.scala | 10
-rw-r--r--  examples/src/main/scala/spark/streaming/examples/KafkaWordCount.scala | 2
-rw-r--r--  examples/src/main/scala/spark/streaming/examples/StatefulNetworkWordCount.scala | 50
-rw-r--r--  examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala | 9
-rw-r--r--  examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdHLL.scala | 9
-rw-r--r--  examples/src/main/scala/spark/streaming/examples/TwitterPopularTags.scala | 9
8 files changed, 302 insertions(+), 18 deletions(-)
diff --git a/examples/src/main/scala/spark/examples/CassandraTest.scala b/examples/src/main/scala/spark/examples/CassandraTest.scala
new file mode 100644
index 0000000000..0fe1833e83
--- /dev/null
+++ b/examples/src/main/scala/spark/examples/CassandraTest.scala
@@ -0,0 +1,196 @@
+package spark.examples
+
+import org.apache.hadoop.mapreduce.Job
+import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat
+import org.apache.cassandra.hadoop.ConfigHelper
+import org.apache.cassandra.hadoop.ColumnFamilyInputFormat
+import org.apache.cassandra.thrift._
+import spark.SparkContext
+import spark.SparkContext._
+import java.nio.ByteBuffer
+import java.util.SortedMap
+import org.apache.cassandra.db.IColumn
+import org.apache.cassandra.utils.ByteBufferUtil
+import scala.collection.JavaConversions._
+
+
+/*
+ * This example demonstrates how to use Spark with Cassandra through the new Hadoop API and
+ * Cassandra's Hadoop support.
+ *
+ * To run this example, invoke it with the following command-line parameters:
+ * <spark_master> <cassandra_node> <cassandra_port>
+ *
+ * For example, to run against a Cassandra node on localhost:
+ * local[3] localhost 9160
+ *
+ * The example makes some assumptions:
+ * 1. You have already created a keyspace called casDemo with a column family named Words.
+ * 2. The Words column family has a column named "para" that holds test content.
+ *
+ * You can create this content by running the script at the bottom of this file with
+ * cassandra-cli.
+ */
+object CassandraTest {
+
+ def main(args: Array[String]) {
+
+ // Get a SparkContext
+ val sc = new SparkContext(args(0), "casDemo")
+
+ // Build the job configuration with ConfigHelper provided by Cassandra
+ val job = new Job()
+ job.setInputFormatClass(classOf[ColumnFamilyInputFormat])
+
+ val host: String = args(1)
+ val port: String = args(2)
+
+ ConfigHelper.setInputInitialAddress(job.getConfiguration(), host)
+ ConfigHelper.setInputRpcPort(job.getConfiguration(), port)
+ ConfigHelper.setOutputInitialAddress(job.getConfiguration(), host)
+ ConfigHelper.setOutputRpcPort(job.getConfiguration(), port)
+ ConfigHelper.setInputColumnFamily(job.getConfiguration(), "casDemo", "Words")
+ ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "casDemo", "WordCount")
+
+ val predicate = new SlicePredicate()
+ val sliceRange = new SliceRange()
+ sliceRange.setStart(Array.empty[Byte])
+ sliceRange.setFinish(Array.empty[Byte])
+ predicate.setSlice_range(sliceRange)
+ ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate)
+
+ ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner")
+ ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner")
+
+ // Make a new Hadoop RDD
+ val casRdd = sc.newAPIHadoopRDD(
+ job.getConfiguration(),
+ classOf[ColumnFamilyInputFormat],
+ classOf[ByteBuffer],
+ classOf[SortedMap[ByteBuffer, IColumn]])
+
+ // Let us first get all the paragraphs from the retrieved rows
+ val paraRdd = casRdd.map {
+ case (key, value) => {
+ ByteBufferUtil.string(value.get(ByteBufferUtil.bytes("para")).value())
+ }
+ }
+
+ // Let's count the words across the paragraphs
+ val counts = paraRdd.flatMap(p => p.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
+
+ counts.collect().foreach {
+ case (word, count) => println(word + ":" + count)
+ }
+
+ counts.map {
+ case (word, count) => {
+ val colWord = new org.apache.cassandra.thrift.Column()
+ colWord.setName(ByteBufferUtil.bytes("word"))
+ colWord.setValue(ByteBufferUtil.bytes(word))
+ colWord.setTimestamp(System.currentTimeMillis)
+
+ val colCount = new org.apache.cassandra.thrift.Column()
+ colCount.setName(ByteBufferUtil.bytes("wcount"))
+ colCount.setValue(ByteBufferUtil.bytes(count.toLong))
+ colCount.setTimestamp(System.currentTimeMillis)
+
+ val outputkey = ByteBufferUtil.bytes(word + "-COUNT-" + System.currentTimeMillis)
+
+ val mutations: java.util.List[Mutation] = new Mutation() :: new Mutation() :: Nil
+ mutations.get(0).setColumn_or_supercolumn(new ColumnOrSuperColumn())
+ mutations.get(0).column_or_supercolumn.setColumn(colWord)
+ mutations.get(1).setColumn_or_supercolumn(new ColumnOrSuperColumn())
+ mutations.get(1).column_or_supercolumn.setColumn(colCount)
+ (outputkey, mutations)
+ }
+ }.saveAsNewAPIHadoopFile("casDemo", classOf[ByteBuffer], classOf[List[Mutation]],
+ classOf[ColumnFamilyOutputFormat], job.getConfiguration)
+ }
+}
+
+/*
+create keyspace casDemo;
+use casDemo;
+
+create column family WordCount with comparator = UTF8Type;
+update column family WordCount with column_metadata =
+ [{column_name: word, validation_class: UTF8Type},
+ {column_name: wcount, validation_class: LongType}];
+
+create column family Words with comparator = UTF8Type;
+update column family Words with column_metadata =
+ [{column_name: book, validation_class: UTF8Type},
+ {column_name: para, validation_class: UTF8Type}];
+
+assume Words keys as utf8;
+
+set Words['3musk001']['book'] = 'The Three Musketeers';
+set Words['3musk001']['para'] = 'On the first Monday of the month of April, 1625, the market
+ town of Meung, in which the author of ROMANCE OF THE ROSE was born, appeared to
+ be in as perfect a state of revolution as if the Huguenots had just made
+ a second La Rochelle of it. Many citizens, seeing the women flying
+ toward the High Street, leaving their children crying at the open doors,
+ hastened to don the cuirass, and supporting their somewhat uncertain
+ courage with a musket or a partisan, directed their steps toward the
+ hostelry of the Jolly Miller, before which was gathered, increasing
+ every minute, a compact group, vociferous and full of curiosity.';
+
+set Words['3musk002']['book'] = 'The Three Musketeers';
+set Words['3musk002']['para'] = 'In those times panics were common, and few days passed without
+ some city or other registering in its archives an event of this kind. There were
+ nobles, who made war against each other; there was the king, who made
+ war against the cardinal; there was Spain, which made war against the
+ king. Then, in addition to these concealed or public, secret or open
+ wars, there were robbers, mendicants, Huguenots, wolves, and scoundrels,
+ who made war upon everybody. The citizens always took up arms readily
+ against thieves, wolves or scoundrels, often against nobles or
+ Huguenots, sometimes against the king, but never against cardinal or
+ Spain. It resulted, then, from this habit that on the said first Monday
+ of April, 1625, the citizens, on hearing the clamor, and seeing neither
+ the red-and-yellow standard nor the livery of the Duc de Richelieu,
+ rushed toward the hostel of the Jolly Miller. When arrived there, the
+ cause of the hubbub was apparent to all';
+
+set Words['3musk003']['book'] = 'The Three Musketeers';
+set Words['3musk003']['para'] = 'You ought, I say, then, to husband the means you have, however
+ large the sum may be; but you ought also to endeavor to perfect yourself in
+ the exercises becoming a gentleman. I will write a letter today to the
+ Director of the Royal Academy, and tomorrow he will admit you without
+ any expense to yourself. Do not refuse this little service. Our
+ best-born and richest gentlemen sometimes solicit it without being able
+ to obtain it. You will learn horsemanship, swordsmanship in all its
+ branches, and dancing. You will make some desirable acquaintances; and
+ from time to time you can call upon me, just to tell me how you are
+ getting on, and to say whether I can be of further service to you.';
+
+
+set Words['thelostworld001']['book'] = 'The Lost World';
+set Words['thelostworld001']['para'] = 'She sat with that proud, delicate profile of hers outlined
+ against the red curtain. How beautiful she was! And yet how aloof! We had been
+ friends, quite good friends; but never could I get beyond the same
+ comradeship which I might have established with one of my
+ fellow-reporters upon the Gazette,--perfectly frank, perfectly kindly,
+ and perfectly unsexual. My instincts are all against a woman being too
+ frank and at her ease with me. It is no compliment to a man. Where
+ the real sex feeling begins, timidity and distrust are its companions,
+ heritage from old wicked days when love and violence went often hand in
+ hand. The bent head, the averted eye, the faltering voice, the wincing
+ figure--these, and not the unshrinking gaze and frank reply, are the
+ true signals of passion. Even in my short life I had learned as much
+ as that--or had inherited it in that race memory which we call instinct.';
+
+set Words['thelostworld002']['book'] = 'The Lost World';
+set Words['thelostworld002']['para'] = 'I always liked McArdle, the crabbed, old, round-backed,
+ red-headed news editor, and I rather hoped that he liked me. Of course, Beaumont was
+ the real boss; but he lived in the rarefied atmosphere of some Olympian
+ height from which he could distinguish nothing smaller than an
+ international crisis or a split in the Cabinet. Sometimes we saw him
+ passing in lonely majesty to his inner sanctum, with his eyes staring
+ vaguely and his mind hovering over the Balkans or the Persian Gulf. He
+ was above and beyond us. But McArdle was his first lieutenant, and it
+ was he that we knew. The old man nodded as I entered the room, and he
+ pushed his spectacles far up on his bald forehead.';
+
+*/
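Note: the write path above stores each count under a fresh row key ("<word>-COUNT-<timestamp>"). As a minimal sketch of reading those results back with the same new-Hadoop-API plumbing — the reuse of sc and job.getConfiguration() from CassandraTest.main is an assumption for illustration, not part of this patch:

    // Hypothetical read-back of the WordCount column family (assumes the
    // SparkContext and Job configuration built in CassandraTest.main).
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), "casDemo", "WordCount")
    val wcRdd = sc.newAPIHadoopRDD(
      job.getConfiguration(),
      classOf[ColumnFamilyInputFormat],
      classOf[ByteBuffer],
      classOf[SortedMap[ByteBuffer, IColumn]])
    wcRdd.map {
      case (key, cols) =>
        val word  = ByteBufferUtil.string(cols.get(ByteBufferUtil.bytes("word")).value())
        val count = ByteBufferUtil.toLong(cols.get(ByteBufferUtil.bytes("wcount")).value())
        (word, count)
    }.collect().foreach { case (w, c) => println(w + ":" + c) }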
diff --git a/examples/src/main/scala/spark/examples/HBaseTest.scala b/examples/src/main/scala/spark/examples/HBaseTest.scala
new file mode 100644
index 0000000000..6e910154d4
--- /dev/null
+++ b/examples/src/main/scala/spark/examples/HBaseTest.scala
@@ -0,0 +1,35 @@
+package spark.examples
+
+import spark._
+import spark.rdd.NewHadoopRDD
+import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
+import org.apache.hadoop.hbase.client.HBaseAdmin
+import org.apache.hadoop.hbase.mapreduce.TableInputFormat
+
+object HBaseTest {
+ def main(args: Array[String]) {
+ val sc = new SparkContext(args(0), "HBaseTest",
+ System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
+
+ val conf = HBaseConfiguration.create()
+
+ // Other options for configuring scan behavior are available. More information available at
+ // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
+ conf.set(TableInputFormat.INPUT_TABLE, args(1))
+
+ // Initialize hBase table if necessary
+ val admin = new HBaseAdmin(conf)
+ if(!admin.isTableAvailable(args(1))) {
+ val tableDesc = new HTableDescriptor(args(1))
+ admin.createTable(tableDesc)
+ }
+
+ val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
+ classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
+ classOf[org.apache.hadoop.hbase.client.Result])
+
+ hBaseRDD.count()
+
+ System.exit(0)
+ }
+}
\ No newline at end of file
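As a hedged sketch of what one typically does with the resulting (ImmutableBytesWritable, Result) pairs — the column family "cf" and qualifier "col" are placeholders, not from this patch:

    import org.apache.hadoop.hbase.util.Bytes

    // Hypothetical: pull the row key and one cell out of each scanned Result.
    // "cf" and "col" are placeholder family/qualifier names.
    val rows = hBaseRDD.map { case (_, result) =>
      val rowKey = Bytes.toString(result.getRow)
      val value  = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("col")))
      (rowKey, value)
    }
    rows.take(10).foreach(println)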
diff --git a/examples/src/main/scala/spark/examples/SparkHdfsLR.scala b/examples/src/main/scala/spark/examples/SparkHdfsLR.scala
index 0f42f405a0..3d080a0257 100644
--- a/examples/src/main/scala/spark/examples/SparkHdfsLR.scala
+++ b/examples/src/main/scala/spark/examples/SparkHdfsLR.scala
@@ -4,6 +4,8 @@ import java.util.Random
import scala.math.exp
import spark.util.Vector
import spark._
+import spark.deploy.SparkHadoopUtil
+import spark.scheduler.InputFormatInfo
/**
* Logistic regression based classification.
@@ -32,9 +34,13 @@ object SparkHdfsLR {
System.err.println("Usage: SparkHdfsLR <master> <file> <iters>")
System.exit(1)
}
+ val inputPath = args(1)
+ val conf = SparkHadoopUtil.newConfiguration()
val sc = new SparkContext(args(0), "SparkHdfsLR",
- System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
- val lines = sc.textFile(args(1))
+ System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")), Map(),
+ InputFormatInfo.computePreferredLocations(
+ Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))))
+ val lines = sc.textFile(inputPath)
val points = lines.map(parsePoint _).cache()
val ITERATIONS = args(2).toInt
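The new fifth constructor argument asks the scheduler to place executors near the HDFS blocks of the input. A sketch of the same call over several inputs at once, assuming the InputFormatInfo API shown above (the paths are invented):

    // Hypothetical: preferred locations computed over multiple input paths.
    val paths = Seq("/data/part1", "/data/part2")  // placeholder paths
    val infos = paths.map(p =>
      new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], p))
    val sc = new SparkContext(args(0), "SparkHdfsLR",
      System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")), Map(),
      InputFormatInfo.computePreferredLocations(infos))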
diff --git a/examples/src/main/scala/spark/streaming/examples/KafkaWordCount.scala b/examples/src/main/scala/spark/streaming/examples/KafkaWordCount.scala
index c3a9e491ba..9202e65e09 100644
--- a/examples/src/main/scala/spark/streaming/examples/KafkaWordCount.scala
+++ b/examples/src/main/scala/spark/streaming/examples/KafkaWordCount.scala
@@ -37,7 +37,7 @@ object KafkaWordCount {
ssc.checkpoint("checkpoint")
val topicpMap = topics.split(",").map((_,numThreads.toInt)).toMap
- val lines = ssc.kafkaStream[String](zkQuorum, group, topicpMap)
+ val lines = ssc.kafkaStream(zkQuorum, group, topicpMap)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1l)).reduceByKeyAndWindow(add _, subtract _, Minutes(10), Seconds(2), 2)
wordCounts.print()
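The windowed count depends on reduceByKeyAndWindow's invertible-reduce form: add folds incoming batches into the 10-minute window, while subtract backs out batches that slide past it. The two functions are referenced but not shown in this hunk; their presumed shape, consistent with the (x, 1l) Long pairs, is:

    // Assumed inverse pair for the windowed reduce.
    def add(v1: Long, v2: Long): Long = v1 + v2
    def subtract(v1: Long, v2: Long): Long = v1 - v2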
diff --git a/examples/src/main/scala/spark/streaming/examples/StatefulNetworkWordCount.scala b/examples/src/main/scala/spark/streaming/examples/StatefulNetworkWordCount.scala
new file mode 100644
index 0000000000..51c3c9f9b4
--- /dev/null
+++ b/examples/src/main/scala/spark/streaming/examples/StatefulNetworkWordCount.scala
@@ -0,0 +1,50 @@
+package spark.streaming.examples
+
+import spark.streaming._
+import spark.streaming.StreamingContext._
+
+/**
+ * Counts words cumulatively in UTF8 encoded, '\n' delimited text received from the network every second.
+ * Usage: StatefulNetworkWordCount <master> <hostname> <port>
+ * <master> is the Spark master URL. In local mode, <master> should be 'local[n]' with n > 1.
+ * <hostname> and <port> describe the TCP server that Spark Streaming connects to in order to receive data.
+ *
+ * To run this on your local machine, you need to first run a Netcat server
+ * `$ nc -lk 9999`
+ * and then run the example
+ * `$ ./run spark.streaming.examples.StatefulNetworkWordCount local[2] localhost 9999`
+ */
+object StatefulNetworkWordCount {
+ def main(args: Array[String]) {
+ if (args.length < 3) {
+ System.err.println("Usage: StatefulNetworkWordCount <master> <hostname> <port>\n" +
+ "In local mode, <master> should be 'local[n]' with n > 1")
+ System.exit(1)
+ }
+
+ val updateFunc = (values: Seq[Int], state: Option[Int]) => {
+ val currentCount = values.foldLeft(0)(_ + _)
+
+ val previousCount = state.getOrElse(0)
+
+ Some(currentCount + previousCount)
+ }
+
+ // Create the context with a 1 second batch size
+ val ssc = new StreamingContext(args(0), "NetworkWordCumulativeCountUpdateStateByKey", Seconds(1),
+ System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
+ ssc.checkpoint(".")
+
+ // Create a NetworkInputDStream on the target ip:port and count the
+ // words in the input stream of \n delimited text (e.g. generated by 'nc')
+ val lines = ssc.socketTextStream(args(1), args(2).toInt)
+ val words = lines.flatMap(_.split(" "))
+ val wordDstream = words.map(x => (x, 1))
+
+ // Update the cumulative count using updateStateByKey.
+ // This yields a DStream of state, i.e. the cumulative count of each word.
+ val stateDstream = wordDstream.updateStateByKey[Int](updateFunc)
+ stateDstream.print()
+ ssc.start()
+ }
+}
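To make the state evolution concrete, a small worked trace of updateFunc outside Spark (the batch contents are invented):

    // Batch 1: a word arrives twice, with no prior state.
    updateFunc(Seq(1, 1), None)     // => Some(2)
    // Batch 2: the same word arrives once; prior state is Some(2).
    updateFunc(Seq(1), Some(2))     // => Some(3)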
diff --git a/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala b/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala
index a9642100e3..528778ed72 100644
--- a/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala
+++ b/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdCMS.scala
@@ -26,8 +26,8 @@ import spark.SparkContext._
*/
object TwitterAlgebirdCMS {
def main(args: Array[String]) {
- if (args.length < 3) {
- System.err.println("Usage: TwitterAlgebirdCMS <master> <twitter_username> <twitter_password>" +
+ if (args.length < 1) {
+ System.err.println("Usage: TwitterAlgebirdCMS <master>" +
" [filter1] [filter2] ... [filter n]")
System.exit(1)
}
@@ -40,12 +40,11 @@ object TwitterAlgebirdCMS {
// K highest frequency elements to take
val TOPK = 10
- val Array(master, username, password) = args.slice(0, 3)
- val filters = args.slice(3, args.length)
+ val (master, filters) = (args.head, args.tail)
val ssc = new StreamingContext(master, "TwitterAlgebirdCMS", Seconds(10),
System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
- val stream = ssc.twitterStream(username, password, filters, StorageLevel.MEMORY_ONLY_SER)
+ val stream = ssc.twitterStream(None, filters, StorageLevel.MEMORY_ONLY_SER)
val users = stream.map(status => status.getUser.getId)
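All three Twitter examples drop the positional username/password arguments: the remaining args split cleanly into master and filters, and twitterStream now takes an Option for authentication (None presumably defers to credentials picked up from configuration). A tiny sketch of the destructuring, with an invented argument vector:

    // Hypothetical invocation: master first, then any number of filters.
    val args = Array("local[2]", "spark", "streaming")
    val (master, filters) = (args.head, args.tail)
    // master == "local[2]", filters == Array("spark", "streaming")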
diff --git a/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdHLL.scala b/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdHLL.scala
index f3288bfb85..896e9fd8af 100644
--- a/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdHLL.scala
+++ b/examples/src/main/scala/spark/streaming/examples/TwitterAlgebirdHLL.scala
@@ -21,20 +21,19 @@ import spark.streaming.dstream.TwitterInputDStream
*/
object TwitterAlgebirdHLL {
def main(args: Array[String]) {
- if (args.length < 3) {
- System.err.println("Usage: TwitterAlgebirdHLL <master> <twitter_username> <twitter_password>" +
+ if (args.length < 1) {
+ System.err.println("Usage: TwitterAlgebirdHLL <master>" +
" [filter1] [filter2] ... [filter n]")
System.exit(1)
}
/** Bit size parameter for HyperLogLog, trades off accuracy vs size */
val BIT_SIZE = 12
- val Array(master, username, password) = args.slice(0, 3)
- val filters = args.slice(3, args.length)
+ val (master, filters) = (args.head, args.tail)
val ssc = new StreamingContext(master, "TwitterAlgebirdHLL", Seconds(5),
System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
- val stream = ssc.twitterStream(username, password, filters, StorageLevel.MEMORY_ONLY_SER)
+ val stream = ssc.twitterStream(None, filters, StorageLevel.MEMORY_ONLY_SER)
val users = stream.map(status => status.getUser.getId)
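For the BIT_SIZE tradeoff noted above: HyperLogLog with b bits uses 2^b registers, and its standard error is commonly quoted as 1.04 / sqrt(2^b). A quick check for this example's setting (plain Scala, not in the patch):

    // Approximate relative error for BIT_SIZE = 12.
    val bits = 12
    val stdErr = 1.04 / math.sqrt(math.pow(2, bits))  // ≈ 0.0163, i.e. about 1.6%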
diff --git a/examples/src/main/scala/spark/streaming/examples/TwitterPopularTags.scala b/examples/src/main/scala/spark/streaming/examples/TwitterPopularTags.scala
index 9d4494c6f2..65f0b6d352 100644
--- a/examples/src/main/scala/spark/streaming/examples/TwitterPopularTags.scala
+++ b/examples/src/main/scala/spark/streaming/examples/TwitterPopularTags.scala
@@ -12,18 +12,17 @@ import spark.SparkContext._
*/
object TwitterPopularTags {
def main(args: Array[String]) {
- if (args.length < 3) {
- System.err.println("Usage: TwitterPopularTags <master> <twitter_username> <twitter_password>" +
+ if (args.length < 1) {
+ System.err.println("Usage: TwitterPopularTags <master>" +
" [filter1] [filter2] ... [filter n]")
System.exit(1)
}
- val Array(master, username, password) = args.slice(0, 3)
- val filters = args.slice(3, args.length)
+ val (master, filters) = (args.head, args.tail)
val ssc = new StreamingContext(master, "TwitterPopularTags", Seconds(2),
System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))
- val stream = ssc.twitterStream(username, password, filters)
+ val stream = ssc.twitterStream(None, filters)
val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))
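A hashTags stream like the one above is typically ranked over a sliding window; an illustrative sketch under that assumption (not taken from this patch):

    // Illustrative: count hashtags over the last 60 seconds and print the counts.
    val tagCounts = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
    tagCounts.print()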