From d012cfa0e8b29a1a7412bdff41eb159c4afe7d34 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Sat, 26 Jan 2013 22:48:39 -0800 Subject: Update spark_ec2.py to use new spark-ec2 scripts --- ec2/deploy.generic/root/spark-ec2/ec2-variables.sh | 11 ++++++++ ec2/spark_ec2.py | 33 ++++++++++++++-------- 2 files changed, 32 insertions(+), 12 deletions(-) create mode 100644 ec2/deploy.generic/root/spark-ec2/ec2-variables.sh diff --git a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh new file mode 100644 index 0000000000..948cb5b1ec --- /dev/null +++ b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# These variables are automatically filled in by the mesos-ec2 script. +export MESOS_MASTERS="{{master_list}}" +export MESOS_SLAVES="{{slave_list}}" +export MESOS_ZOO_LIST="{{zoo_list}}" +export MESOS_HDFS_DATA_DIRS="{{hdfs_data_dirs}}" +export MESOS_MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" +export MESOS_SPARK_LOCAL_DIRS="{{spark_local_dirs}}" +export MODULES="{{modules}}" +export SWAP="{{swap}}" diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index a5384d3bda..f2385b5b56 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -358,25 +358,31 @@ def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): # Deploy configuration files and run setup scripts on a newly launched # or started EC2 cluster. def setup_cluster(conn, master_nodes, slave_nodes, zoo_nodes, opts, deploy_ssh_key): - print "Deploying files to master..." - deploy_files(conn, "deploy.generic", opts, master_nodes, slave_nodes, zoo_nodes) + if opts.cluster_type == "mesos": + modules = ['ephemeral-hdfs', 'persistent-hdfs', 'mesos'] + elif opts.cluster_type == "standalone": + modules = ['ephemeral-hdfs', 'persistent-hdfs', 'spark-standalone'] + master = master_nodes[0].public_dns_name if deploy_ssh_key: print "Copying SSH key %s to master..." % opts.identity_file ssh(master, opts, 'mkdir -p ~/.ssh') scp(master, opts, opts.identity_file, '~/.ssh/id_rsa') ssh(master, opts, 'chmod 600 ~/.ssh/id_rsa') + + # NOTE: We should clone the repository before running deploy_files to prevent + # ec2-variables.sh from being overwritten + ssh(master, opts, "rm -rf spark-ec2 && git clone https://github.com/shivaram/spark-ec2.git") + print "Deploying files to master..." + deploy_files(conn, "deploy.generic", opts, master_nodes, slave_nodes, + zoo_nodes, modules) print "Running setup on master..." - if opts.cluster_type == "mesos": - setup_mesos_cluster(master, opts) - elif opts.cluster_type == "standalone": - setup_standalone_cluster(master, slave_nodes, opts) + setup_spark_cluster(master, opts) print "Done!" -def setup_mesos_cluster(master, opts): - ssh(master, opts, "chmod u+x mesos-ec2/setup") - ssh(master, opts, "mesos-ec2/setup %s %s %s %s" % - ("generic", "none", "master", opts.swap)) +def setup_spark_cluster(master, opts): + ssh(master, opts, "chmod u+x spark-ec2/setup.sh") + ssh(master, opts, "spark-ec2/setup.sh") def setup_standalone_cluster(master, slave_nodes, opts): slave_ips = '\n'.join([i.public_dns_name for i in slave_nodes]) @@ -427,7 +433,8 @@ def get_num_disks(instance_type): # cluster (e.g. lists of masters and slaves). Files are only deployed to # the first master instance in the cluster, and we expect the setup # script to be run on that instance to copy them to other nodes. 
-def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, zoo_nodes): +def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, zoo_nodes, + modules): active_master = master_nodes[0].public_dns_name num_disks = get_num_disks(opts.instance_type) @@ -459,7 +466,9 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, zoo_nodes): "cluster_url": cluster_url, "hdfs_data_dirs": hdfs_data_dirs, "mapred_local_dirs": mapred_local_dirs, - "spark_local_dirs": spark_local_dirs + "spark_local_dirs": spark_local_dirs, + "swap": str(opts.swap), + "modules": '\n'.join(modules) } # Create a temp directory in which we will place all the files to be -- cgit v1.2.3 From 0243b081ce4348c3d2955f2c16c0d3a61620be34 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Sun, 27 Jan 2013 00:25:29 -0800 Subject: Fix swap variable name --- ec2/deploy.generic/root/spark-ec2/ec2-variables.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh index 948cb5b1ec..166a884c88 100644 --- a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -8,4 +8,4 @@ export MESOS_HDFS_DATA_DIRS="{{hdfs_data_dirs}}" export MESOS_MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" export MESOS_SPARK_LOCAL_DIRS="{{spark_local_dirs}}" export MODULES="{{modules}}" -export SWAP="{{swap}}" +export SWAP_MB="{{swap}}" -- cgit v1.2.3 From dc9d3ab6ed7ec2122ec9fdd248e236393601725c Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Sun, 27 Jan 2013 00:26:00 -0800 Subject: Add option to start ganglia. Also enable Hadoop ports even if cluster type is not mesos --- ec2/spark_ec2.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index f2385b5b56..dfccb6c238 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -84,6 +84,9 @@ def parse_args(): "maximum price (in dollars)") parser.add_option("-c", "--cluster-type", default="mesos", help="'mesos' for a mesos cluster, 'standalone' for a standalone spark cluster (default: mesos)") + parser.add_option("-g", "--ganglia", action="store_true", default=False, + help="Setup ganglia monitoring for the cluster. 
NOTE: The ganglia " + + "monitoring page will be publicly accessible") parser.add_option("-u", "--user", default="root", help="The ssh user you want to connect as (default: root)") parser.add_option("--delete-groups", action="store_true", default=False, @@ -164,22 +167,23 @@ def launch_cluster(conn, opts, cluster_name): master_group.authorize(src_group=zoo_group) master_group.authorize('tcp', 22, 22, '0.0.0.0/0') master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') + master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') + master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') + master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') if opts.cluster_type == "mesos": - master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0') - master_group.authorize('tcp', 50070, 50070, '0.0.0.0/0') - master_group.authorize('tcp', 60070, 60070, '0.0.0.0/0') master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') + if opts.ganglia: + master_group.authorize('tcp', 80, 80, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) slave_group.authorize(src_group=zoo_group) slave_group.authorize('tcp', 22, 22, '0.0.0.0/0') slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0') - if opts.cluster_type == "mesos": - slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') - slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') - slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') - slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') + slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0') + slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0') + slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0') + slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0') if zoo_group.rules == []: # Group was just now created zoo_group.authorize(src_group=master_group) zoo_group.authorize(src_group=slave_group) @@ -363,6 +367,9 @@ def setup_cluster(conn, master_nodes, slave_nodes, zoo_nodes, opts, deploy_ssh_k elif opts.cluster_type == "standalone": modules = ['ephemeral-hdfs', 'persistent-hdfs', 'spark-standalone'] + if opts.ganglia: + modules.append('ganglia') + master = master_nodes[0].public_dns_name if deploy_ssh_key: print "Copying SSH key %s to master..." % opts.identity_file -- cgit v1.2.3 From da44a391fb74fc008211d5ec122cdcfc1726bc32 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Mon, 28 Jan 2013 11:16:14 -0800 Subject: Add an option to use the old scripts --- ec2/spark_ec2.py | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index dfccb6c238..cafb7bf011 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -87,6 +87,9 @@ def parse_args(): parser.add_option("-g", "--ganglia", action="store_true", default=False, help="Setup ganglia monitoring for the cluster. NOTE: The ganglia " + "monitoring page will be publicly accessible") + parser.add_option("--mesos-scripts", action="store_true", default=False, + help="Use older mesos-ec2 scripts to setup the cluster. NOTE: Ganglia " + + "will not be setup with this option") parser.add_option("-u", "--user", default="root", help="The ssh user you want to connect as (default: root)") parser.add_option("--delete-groups", action="store_true", default=False, @@ -362,6 +365,13 @@ def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): # Deploy configuration files and run setup scripts on a newly launched # or started EC2 cluster. 
def setup_cluster(conn, master_nodes, slave_nodes, zoo_nodes, opts, deploy_ssh_key): + master = master_nodes[0].public_dns_name + if deploy_ssh_key: + print "Copying SSH key %s to master..." % opts.identity_file + ssh(master, opts, 'mkdir -p ~/.ssh') + scp(master, opts, opts.identity_file, '~/.ssh/id_rsa') + ssh(master, opts, 'chmod 600 ~/.ssh/id_rsa') + if opts.cluster_type == "mesos": modules = ['ephemeral-hdfs', 'persistent-hdfs', 'mesos'] elif opts.cluster_type == "standalone": @@ -370,32 +380,39 @@ def setup_cluster(conn, master_nodes, slave_nodes, zoo_nodes, opts, deploy_ssh_k if opts.ganglia: modules.append('ganglia') - master = master_nodes[0].public_dns_name - if deploy_ssh_key: - print "Copying SSH key %s to master..." % opts.identity_file - ssh(master, opts, 'mkdir -p ~/.ssh') - scp(master, opts, opts.identity_file, '~/.ssh/id_rsa') - ssh(master, opts, 'chmod 600 ~/.ssh/id_rsa') + if not opts.mesos_scripts: + # NOTE: We should clone the repository before running deploy_files to + # prevent ec2-variables.sh from being overwritten + ssh(master, opts, "rm -rf spark-ec2 && git clone https://github.com/shivaram/spark-ec2.git") - # NOTE: We should clone the repository before running deploy_files to prevent - # ec2-variables.sh from being overwritten - ssh(master, opts, "rm -rf spark-ec2 && git clone https://github.com/shivaram/spark-ec2.git") print "Deploying files to master..." deploy_files(conn, "deploy.generic", opts, master_nodes, slave_nodes, zoo_nodes, modules) + print "Running setup on master..." - setup_spark_cluster(master, opts) + if opts.mesos_scripts: + if opts.cluster_type == "mesos": + setup_mesos_cluster(master, opts) + elif opts.cluster_type == "standalone": + setup_standalone_cluster(master, slave_nodes, opts) + else: + setup_spark_cluster(master, opts) print "Done!" -def setup_spark_cluster(master, opts): - ssh(master, opts, "chmod u+x spark-ec2/setup.sh") - ssh(master, opts, "spark-ec2/setup.sh") +def setup_mesos_cluster(master, opts): + ssh(master, opts, "chmod u+x mesos-ec2/setup") + ssh(master, opts, "mesos-ec2/setup %s %s %s %s" % + ("generic", "none", "master", opts.swap)) def setup_standalone_cluster(master, slave_nodes, opts): slave_ips = '\n'.join([i.public_dns_name for i in slave_nodes]) ssh(master, opts, "echo \"%s\" > spark/conf/slaves" % (slave_ips)) ssh(master, opts, "/root/spark/bin/start-all.sh") +def setup_spark_cluster(master, opts): + ssh(master, opts, "chmod u+x spark-ec2/setup.sh") + ssh(master, opts, "spark-ec2/setup.sh") + # Wait for a whole cluster (masters, slaves and ZooKeeper) to start up def wait_for_cluster(conn, wait_secs, master_nodes, slave_nodes, zoo_nodes): -- cgit v1.2.3 From bf675ab4f90fdbea67e42a8df828ef15ed87a086 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Thu, 31 Jan 2013 21:43:45 -0800 Subject: Turn on ganglia by default --- ec2/spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index cafb7bf011..ce1072fd39 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -84,7 +84,7 @@ def parse_args(): "maximum price (in dollars)") parser.add_option("-c", "--cluster-type", default="mesos", help="'mesos' for a mesos cluster, 'standalone' for a standalone spark cluster (default: mesos)") - parser.add_option("-g", "--ganglia", action="store_true", default=False, + parser.add_option("-g", "--ganglia", action="store_true", default=True, help="Setup ganglia monitoring for the cluster. 
NOTE: The ganglia " + "monitoring page will be publicly accessible") parser.add_option("--mesos-scripts", action="store_true", default=False, -- cgit v1.2.3 From cc37601ecb72abd1351ed73b3be1fb517a31a4e1 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 4 Feb 2013 14:15:50 -0800 Subject: Adding an example with an OLAP roll-up --- .../src/main/scala/spark/examples/OLAPQuery.scala | 66 ++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 examples/src/main/scala/spark/examples/OLAPQuery.scala diff --git a/examples/src/main/scala/spark/examples/OLAPQuery.scala b/examples/src/main/scala/spark/examples/OLAPQuery.scala new file mode 100644 index 0000000000..ff3af01b17 --- /dev/null +++ b/examples/src/main/scala/spark/examples/OLAPQuery.scala @@ -0,0 +1,66 @@ +package spark.examples + +import spark.SparkContext +import spark.SparkContext._ +/** + * Executes a roll up-style query against Apache logs. + */ +object OLAPQuery { + val exampleApacheLogs = List( + """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://images.com/2013/Generic.jpg + | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; + | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR + | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR + | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.350 "-" - "" 265 923 934 "" + | 62.24.11.25 images.com 1358492167 - Whatup""".stripMargin.replace("\n", ""), + """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://images.com/2013/Generic.jpg + | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; + | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR + | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR + | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.352 "-" - "" 256 977 988 "" + | 0 73.23.2.15 images.com 1358492557 - Whatup""".stripMargin.replace("\n", "") + ) + + def main(args: Array[String]) { + if (args.length == 0) { + System.err.println("Usage: OLAPQuery [logFile]") + System.exit(1) + } + val sc = new SparkContext(args(0), "OLAP Query") + + val dataSet = + if (args.length == 2) sc.textFile(args(1)) + else sc.parallelize(exampleApacheLogs) + + val apache_log_regex = + """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r + + /** Tracks the total query count and number of aggregate bytes for a particular group. 
*/ + class Stats(val count: Int, val numBytes: Int) extends Serializable { + def merge(other: Stats) = new Stats(count + other.count, numBytes + other.numBytes) + override def toString = "bytes=%s\tn=%s".format(numBytes, count) + } + + def extractKey(line: String): (String, String, String) = { + apache_log_regex findFirstIn line match { + case Some(apache_log_regex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => + if (user != "\"-\"") (ip, user, query) + else (null, null, null) + case _ => (null, null, null) + } + } + + def extractStats(line: String): Stats = { + apache_log_regex findFirstIn line match { + case Some(apache_log_regex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => + new Stats(1, bytes.toInt) + case _ => new Stats(1, 0) + } + } + + dataSet.map(line => (extractKey(line), extractStats(line))) + .reduceByKey((a, b) => a.merge(b)) + .collect().foreach{ + case (user, query) => println("%s\t%s".format(user, query))} + } +} -- cgit v1.2.3 From cfab1a35282c08cec351e468f6e57d1261f02d10 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Tue, 5 Feb 2013 14:31:46 -0800 Subject: add as many fetch requests as we can, subject to maxBytesInFlight --- core/src/main/scala/spark/storage/BlockManager.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/spark/storage/BlockManager.scala b/core/src/main/scala/spark/storage/BlockManager.scala index 9893e9625d..2e7db60841 100644 --- a/core/src/main/scala/spark/storage/BlockManager.scala +++ b/core/src/main/scala/spark/storage/BlockManager.scala @@ -585,7 +585,7 @@ class BlockManager( resultsGotten += 1 val result = results.take() bytesInFlight -= result.size - if (!fetchRequests.isEmpty && + while (!fetchRequests.isEmpty && (bytesInFlight == 0 || bytesInFlight + fetchRequests.front.size <= maxBytesInFlight)) { sendRequest(fetchRequests.dequeue()) } -- cgit v1.2.3 From 67df7f2fa2e09487fe8dcf39ab80606d95383ea5 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Tue, 5 Feb 2013 21:08:21 -0600 Subject: Add private, minor formatting. 
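As an illustrative aside, a minimal standalone sketch of the qualified `Outer.this` reference this patch switches to; `Manager`, `ManagerExample`, and the thread name below are invented for the example and are not Spark code. An anonymous inner class can name its enclosing instance directly, which removes the need for the `val thisInstance = this` alias and lets `run()` stay private to the enclosing class:

    // Invented example of the ConnectionManager.this idiom.
    class Manager {
      private def run(): Unit =
        println("selector loop running on " + Thread.currentThread.getName)

      // `Manager.this` names the enclosing instance, so no
      // `val thisInstance = this` alias is needed.
      val selectorThread = new Thread("manager-thread") {
        override def run(): Unit = Manager.this.run()
      }
    }

    object ManagerExample {
      def main(args: Array[String]): Unit = {
        val m = new Manager
        m.selectorThread.start()
        m.selectorThread.join()
      }
    }
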
--- .../scala/spark/network/ConnectionManager.scala | 35 +++++++++------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/core/src/main/scala/spark/network/ConnectionManager.scala b/core/src/main/scala/spark/network/ConnectionManager.scala index c7f226044d..b6ec664d7e 100644 --- a/core/src/main/scala/spark/network/ConnectionManager.scala +++ b/core/src/main/scala/spark/network/ConnectionManager.scala @@ -66,31 +66,28 @@ private[spark] class ConnectionManager(port: Int) extends Logging { val id = new ConnectionManagerId(Utils.localHostName, serverChannel.socket.getLocalPort) logInfo("Bound socket to port " + serverChannel.socket.getLocalPort() + " with id = " + id) - val thisInstance = this val selectorThread = new Thread("connection-manager-thread") { - override def run() { - thisInstance.run() - } + override def run() = ConnectionManager.this.run() } selectorThread.setDaemon(true) selectorThread.start() - def run() { + private def run() { try { while(!selectorThread.isInterrupted) { - for( (connectionManagerId, sendingConnection) <- connectionRequests) { + for ((connectionManagerId, sendingConnection) <- connectionRequests) { sendingConnection.connect() addConnection(sendingConnection) connectionRequests -= connectionManagerId } sendMessageRequests.synchronized { - while(!sendMessageRequests.isEmpty) { + while (!sendMessageRequests.isEmpty) { val (message, connection) = sendMessageRequests.dequeue connection.send(message) } } - while(!keyInterestChangeRequests.isEmpty) { + while (!keyInterestChangeRequests.isEmpty) { val (key, ops) = keyInterestChangeRequests.dequeue val connection = connectionsByKey(key) val lastOps = key.interestOps() @@ -126,14 +123,11 @@ private[spark] class ConnectionManager(port: Int) extends Logging { if (key.isValid) { if (key.isAcceptable) { acceptConnection(key) - } else - if (key.isConnectable) { + } else if (key.isConnectable) { connectionsByKey(key).asInstanceOf[SendingConnection].finishConnect() - } else - if (key.isReadable) { + } else if (key.isReadable) { connectionsByKey(key).read() - } else - if (key.isWritable) { + } else if (key.isWritable) { connectionsByKey(key).write() } } @@ -144,7 +138,7 @@ private[spark] class ConnectionManager(port: Int) extends Logging { } } - def acceptConnection(key: SelectionKey) { + private def acceptConnection(key: SelectionKey) { val serverChannel = key.channel.asInstanceOf[ServerSocketChannel] val newChannel = serverChannel.accept() val newConnection = new ReceivingConnection(newChannel, selector) @@ -154,7 +148,7 @@ private[spark] class ConnectionManager(port: Int) extends Logging { logInfo("Accepted connection from [" + newConnection.remoteAddress.getAddress + "]") } - def addConnection(connection: Connection) { + private def addConnection(connection: Connection) { connectionsByKey += ((connection.key, connection)) if (connection.isInstanceOf[SendingConnection]) { val sendingConnection = connection.asInstanceOf[SendingConnection] @@ -165,7 +159,7 @@ private[spark] class ConnectionManager(port: Int) extends Logging { connection.onClose(removeConnection) } - def removeConnection(connection: Connection) { + private def removeConnection(connection: Connection) { connectionsByKey -= connection.key if (connection.isInstanceOf[SendingConnection]) { val sendingConnection = connection.asInstanceOf[SendingConnection] @@ -222,16 +216,16 @@ private[spark] class ConnectionManager(port: Int) extends Logging { } } - def handleConnectionError(connection: Connection, e: Exception) { + private def 
handleConnectionError(connection: Connection, e: Exception) { logInfo("Handling connection error on connection to " + connection.remoteConnectionManagerId) removeConnection(connection) } - def changeConnectionKeyInterest(connection: Connection, ops: Int) { + private def changeConnectionKeyInterest(connection: Connection, ops: Int) { keyInterestChangeRequests += ((connection.key, ops)) } - def receiveMessage(connection: Connection, message: Message) { + private def receiveMessage(connection: Connection, message: Message) { val connectionManagerId = ConnectionManagerId.fromSocketAddress(message.senderAddress) logDebug("Received [" + message + "] from [" + connectionManagerId + "]") val runnable = new Runnable() { @@ -351,7 +345,6 @@ private[spark] class ConnectionManager(port: Int) extends Logging { private[spark] object ConnectionManager { def main(args: Array[String]) { - val manager = new ConnectionManager(9999) manager.onReceiveMessage((msg: Message, id: ConnectionManagerId) => { println("Received [" + msg + "] from [" + id + "]") -- cgit v1.2.3 From f2bc7480131c7468eb6d3bc6089a4deadf0a2a88 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Tue, 5 Feb 2013 21:23:36 -0600 Subject: Add RDD.coalesce. --- core/src/main/scala/spark/RDD.scala | 7 +++++++ core/src/main/scala/spark/api/java/JavaRDDLike.scala | 10 ++++++++++ core/src/test/scala/spark/CheckpointSuite.scala | 4 ++-- core/src/test/scala/spark/RDDSuite.scala | 8 ++++---- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala index 9d6ea782bd..f0bc85865c 100644 --- a/core/src/main/scala/spark/RDD.scala +++ b/core/src/main/scala/spark/RDD.scala @@ -20,6 +20,7 @@ import spark.partial.BoundedDouble import spark.partial.CountEvaluator import spark.partial.GroupedCountEvaluator import spark.partial.PartialResult +import spark.rdd.CoalescedRDD import spark.rdd.CartesianRDD import spark.rdd.FilteredRDD import spark.rdd.FlatMappedRDD @@ -231,6 +232,12 @@ abstract class RDD[T: ClassManifest]( def distinct(): RDD[T] = distinct(splits.size) + /** + * Return a new RDD that is reduced into `numSplits` partitions. + */ + def coalesce(numSplits: Int = sc.defaultParallelism): RDD[T] = + new CoalescedRDD(this, numSplits) + /** * Return a sampled subset of this RDD. */ diff --git a/core/src/main/scala/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/spark/api/java/JavaRDDLike.scala index 60025b459c..295eaa57c0 100644 --- a/core/src/main/scala/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/spark/api/java/JavaRDDLike.scala @@ -130,6 +130,16 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround JavaPairRDD.fromRDD(rdd.cartesian(other.rdd)(other.classManifest))(classManifest, other.classManifest) + /** + * Return a new RDD that is reduced into the default number of partitions. + */ + def coalesce(): RDD[T] = coalesce(rdd.context.defaultParallelism) + + /** + * Return a new RDD that is reduced into `numSplits` partitions. + */ + def coalesce(numSplits: Int): RDD[T] = rdd.coalesce(numSplits) + /** * Return an RDD of grouped elements. Each group consists of a key and a sequence of elements * mapping to that key. 
diff --git a/core/src/test/scala/spark/CheckpointSuite.scala b/core/src/test/scala/spark/CheckpointSuite.scala index 0b74607fb8..0d08fd2396 100644 --- a/core/src/test/scala/spark/CheckpointSuite.scala +++ b/core/src/test/scala/spark/CheckpointSuite.scala @@ -114,12 +114,12 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { } test("CoalescedRDD") { - testCheckpointing(new CoalescedRDD(_, 2)) + testCheckpointing(_.coalesce(2)) // Test whether size of CoalescedRDD reduce in size after parent RDD is checkpointed // Current implementation of CoalescedRDDSplit has transient reference to parent RDD, // so only the RDD will reduce in serialized size, not the splits. - testParentCheckpointing(new CoalescedRDD(_, 2), true, false) + testParentCheckpointing(_.coalesce(2), true, false) // Test that the CoalescedRDDSplit updates parent splits (CoalescedRDDSplit.parents) after // the parent RDD has been checkpointed and parent splits have been changed to HadoopSplits. diff --git a/core/src/test/scala/spark/RDDSuite.scala b/core/src/test/scala/spark/RDDSuite.scala index fe7deb10d6..ffa866de75 100644 --- a/core/src/test/scala/spark/RDDSuite.scala +++ b/core/src/test/scala/spark/RDDSuite.scala @@ -122,7 +122,7 @@ class RDDSuite extends FunSuite with LocalSparkContext { sc = new SparkContext("local", "test") val data = sc.parallelize(1 to 10, 10) - val coalesced1 = new CoalescedRDD(data, 2) + val coalesced1 = data.coalesce(2) assert(coalesced1.collect().toList === (1 to 10).toList) assert(coalesced1.glom().collect().map(_.toList).toList === List(List(1, 2, 3, 4, 5), List(6, 7, 8, 9, 10))) @@ -133,19 +133,19 @@ class RDDSuite extends FunSuite with LocalSparkContext { assert(coalesced1.dependencies.head.asInstanceOf[NarrowDependency[_]].getParents(1).toList === List(5, 6, 7, 8, 9)) - val coalesced2 = new CoalescedRDD(data, 3) + val coalesced2 = data.coalesce(3) assert(coalesced2.collect().toList === (1 to 10).toList) assert(coalesced2.glom().collect().map(_.toList).toList === List(List(1, 2, 3), List(4, 5, 6), List(7, 8, 9, 10))) - val coalesced3 = new CoalescedRDD(data, 10) + val coalesced3 = data.coalesce(10) assert(coalesced3.collect().toList === (1 to 10).toList) assert(coalesced3.glom().collect().map(_.toList).toList === (1 to 10).map(x => List(x)).toList) // If we try to coalesce into more partitions than the original RDD, it should just // keep the original number of partitions. - val coalesced4 = new CoalescedRDD(data, 20) + val coalesced4 = data.coalesce(20) assert(coalesced4.collect().toList === (1 to 10).toList) assert(coalesced4.glom().collect().map(_.toList).toList === (1 to 10).map(x => List(x)).toList) -- cgit v1.2.3 From f4d43cb43e64ec3436a129cf3f7a177374451060 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Tue, 5 Feb 2013 21:26:44 -0600 Subject: Remove unneeded zipWithIndex. Also rename r->rdd and remove unneeded extra type info. 
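For illustration, a small self-contained sketch of the cleanup in plain Scala (the `ZipWithIndexExample` object and the string values are invented, not the Spark types): `zipWithIndex` only pays for itself when the index is actually consumed, which is why `getDependencies` drops it while `getSplits` keeps it:

    object ZipWithIndexExample {
      def main(args: Array[String]): Unit = {
        val rdds = Seq("rddA", "rddB", "rddC")

        // Index unused: iterate directly, as getDependencies now does.
        for (rdd <- rdds) println("dependency on " + rdd)

        // Index needed: keep zipWithIndex, as getSplits still does to pair
        // each parent RDD with its dependency position.
        for ((rdd, j) <- rdds.zipWithIndex) println("dep #" + j + " -> " + rdd)
      }
    }
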
--- core/src/main/scala/spark/rdd/CoGroupedRDD.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala index 4893fe8d78..021118c8ba 100644 --- a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala @@ -47,7 +47,7 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) @transient var deps_ = { val deps = new ArrayBuffer[Dependency[_]] - for ((rdd, index) <- rdds.zipWithIndex) { + for (rdd <- rdds) { if (rdd.partitioner == Some(part)) { logInfo("Adding one-to-one dependency with " + rdd) deps += new OneToOneDependency(rdd) @@ -65,12 +65,14 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) @transient var splits_ : Array[Split] = { val array = new Array[Split](part.numPartitions) for (i <- 0 until array.size) { - array(i) = new CoGroupSplit(i, rdds.zipWithIndex.map { case (r, j) => + // Each CoGroupSplit will have a dependency per contributing RDD + array(i) = new CoGroupSplit(i, rdds.zipWithIndex.map { case (rdd, j) => + // Assume each RDD contributed a single dependency, and get it dependencies(j) match { case s: ShuffleDependency[_, _] => - new ShuffleCoGroupSplitDep(s.shuffleId): CoGroupSplitDep + new ShuffleCoGroupSplitDep(s.shuffleId) case _ => - new NarrowCoGroupSplitDep(r, i, r.splits(i)): CoGroupSplitDep + new NarrowCoGroupSplitDep(rdd, i, rdd.splits(i)) } }.toList) } @@ -97,7 +99,7 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) } } for ((dep, depNum) <- split.deps.zipWithIndex) dep match { - case NarrowCoGroupSplitDep(rdd, itsSplitIndex, itsSplit) => { + case NarrowCoGroupSplitDep(rdd, _, itsSplit) => { // Read them from the parent for ((k, v) <- rdd.iterator(itsSplit, context)) { getSeq(k.asInstanceOf[K])(depNum) += v -- cgit v1.2.3 From a9c8d53cfa0bd09565799cec88344b286d7cc436 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Tue, 5 Feb 2013 22:14:18 -0600 Subject: Clean up RDDs, mainly to use getSplits. Also made sure clearDependencies() was calling super, to ensure the getSplits/getDependencies vars in the RDD base class get cleaned up. 
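A condensed sketch of the pattern this commit converges on, using simplified stand-in types (`BaseRDD`, `PairRDD`, and `Array[Int]` splits are invented here; the real classes are `RDD`, its subclasses, and `Split`): the base class caches whatever the `getSplits` hook returns, and subclass `clearDependencies()` overrides chain to `super` so that cached state is released as well:

    // Simplified stand-ins, not the real RDD/Split types.
    abstract class BaseRDD {
      @transient private var cachedSplits: Array[Int] = null

      // Hook that subclasses override instead of keeping their own split cache.
      protected def getSplits: Array[Int]

      final def splits: Array[Int] = {
        if (cachedSplits == null) cachedSplits = getSplits
        cachedSplits
      }

      // Subclasses chain to super so this cached field is released too.
      protected def clearDependencies(): Unit = { cachedSplits = null }
    }

    class PairRDD(var left: BaseRDD, var right: BaseRDD) extends BaseRDD {
      override protected def getSplits: Array[Int] = Array(0, 1)

      override protected def clearDependencies(): Unit = {
        super.clearDependencies()  // clean up the base class first
        left = null
        right = null
      }
    }

    object CleanupExample extends App {
      val rdd = new PairRDD(null, null)  // parents elided for the sketch
      println("splits: " + rdd.splits.mkString(", "))
    }
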
--- core/src/main/scala/spark/RDD.scala | 1 - core/src/main/scala/spark/rdd/BlockRDD.scala | 12 ++++-------- core/src/main/scala/spark/rdd/CartesianRDD.scala | 3 ++- core/src/main/scala/spark/rdd/CheckpointRDD.scala | 4 +--- core/src/main/scala/spark/rdd/CoGroupedRDD.scala | 21 +++++++-------------- core/src/main/scala/spark/rdd/CoalescedRDD.scala | 13 ++++++------- core/src/main/scala/spark/rdd/HadoopRDD.scala | 7 ++----- core/src/main/scala/spark/rdd/NewHadoopRDD.scala | 6 ++---- core/src/main/scala/spark/rdd/SampledRDD.scala | 8 +------- core/src/main/scala/spark/rdd/UnionRDD.scala | 8 ++------ core/src/main/scala/spark/rdd/ZippedRDD.scala | 6 +++--- 11 files changed, 30 insertions(+), 59 deletions(-) diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala index f0bc85865c..5f99591fd5 100644 --- a/core/src/main/scala/spark/RDD.scala +++ b/core/src/main/scala/spark/RDD.scala @@ -656,7 +656,6 @@ abstract class RDD[T: ClassManifest]( */ private[spark] def markCheckpointed(checkpointRDD: RDD[_]) { clearDependencies() - dependencies_ = null splits_ = null deps = null // Forget the constructor argument for dependencies too } diff --git a/core/src/main/scala/spark/rdd/BlockRDD.scala b/core/src/main/scala/spark/rdd/BlockRDD.scala index 2c022f88e0..4214817c65 100644 --- a/core/src/main/scala/spark/rdd/BlockRDD.scala +++ b/core/src/main/scala/spark/rdd/BlockRDD.scala @@ -11,10 +11,6 @@ private[spark] class BlockRDD[T: ClassManifest](sc: SparkContext, @transient blockIds: Array[String]) extends RDD[T](sc, Nil) { - @transient var splits_ : Array[Split] = (0 until blockIds.size).map(i => { - new BlockRDDSplit(blockIds(i), i).asInstanceOf[Split] - }).toArray - @transient lazy val locations_ = { val blockManager = SparkEnv.get.blockManager /*val locations = blockIds.map(id => blockManager.getLocations(id))*/ @@ -22,7 +18,10 @@ class BlockRDD[T: ClassManifest](sc: SparkContext, @transient blockIds: Array[St HashMap(blockIds.zip(locations):_*) } - override def getSplits = splits_ + override def getSplits = (0 until blockIds.size).map(i => { + new BlockRDDSplit(blockIds(i), i).asInstanceOf[Split] + }).toArray + override def compute(split: Split, context: TaskContext): Iterator[T] = { val blockManager = SparkEnv.get.blockManager @@ -37,8 +36,5 @@ class BlockRDD[T: ClassManifest](sc: SparkContext, @transient blockIds: Array[St override def getPreferredLocations(split: Split) = locations_(split.asInstanceOf[BlockRDDSplit].blockId) - override def clearDependencies() { - splits_ = null - } } diff --git a/core/src/main/scala/spark/rdd/CartesianRDD.scala b/core/src/main/scala/spark/rdd/CartesianRDD.scala index 0f9ca06531..2f572a1941 100644 --- a/core/src/main/scala/spark/rdd/CartesianRDD.scala +++ b/core/src/main/scala/spark/rdd/CartesianRDD.scala @@ -35,7 +35,7 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest]( val numSplitsInRdd2 = rdd2.splits.size - override def getSplits: Array[Split] = { + override def getSplits = { // create the cross product split val array = new Array[Split](rdd1.splits.size * rdd2.splits.size) for (s1 <- rdd1.splits; s2 <- rdd2.splits) { @@ -66,6 +66,7 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest]( ) override def clearDependencies() { + super.clearDependencies() rdd1 = null rdd2 = null } diff --git a/core/src/main/scala/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/spark/rdd/CheckpointRDD.scala index 96b593ba7c..7cde523f11 100644 --- a/core/src/main/scala/spark/rdd/CheckpointRDD.scala +++ 
b/core/src/main/scala/spark/rdd/CheckpointRDD.scala @@ -20,7 +20,7 @@ class CheckpointRDD[T: ClassManifest](sc: SparkContext, val checkpointPath: Stri @transient val fs = new Path(checkpointPath).getFileSystem(sc.hadoopConfiguration) - @transient val splits_ : Array[Split] = { + override def getSplits = { val dirContents = fs.listStatus(new Path(checkpointPath)) val splitFiles = dirContents.map(_.getPath.toString).filter(_.contains("part-")).sorted val numSplits = splitFiles.size @@ -34,8 +34,6 @@ class CheckpointRDD[T: ClassManifest](sc: SparkContext, val checkpointPath: Stri checkpointData = Some(new RDDCheckpointData[T](this)) checkpointData.get.cpFile = Some(checkpointPath) - override def getSplits = splits_ - override def getPreferredLocations(split: Split): Seq[String] = { val status = fs.getFileStatus(new Path(checkpointPath)) val locations = fs.getFileBlockLocations(status, 0, status.getLen) diff --git a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala index 021118c8ba..d31ce13706 100644 --- a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala @@ -43,26 +43,22 @@ private[spark] class CoGroupAggregator class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) extends RDD[(K, Seq[Seq[_]])](rdds.head.context, Nil) with Logging { - val aggr = new CoGroupAggregator + private val aggr = new CoGroupAggregator - @transient var deps_ = { - val deps = new ArrayBuffer[Dependency[_]] - for (rdd <- rdds) { + override def getDependencies = { + rdds.map { rdd => if (rdd.partitioner == Some(part)) { logInfo("Adding one-to-one dependency with " + rdd) - deps += new OneToOneDependency(rdd) + new OneToOneDependency(rdd) } else { logInfo("Adding shuffle dependency with " + rdd) val mapSideCombinedRDD = rdd.mapPartitions(aggr.combineValuesByKey(_), true) - deps += new ShuffleDependency[Any, ArrayBuffer[Any]](mapSideCombinedRDD, part) + new ShuffleDependency[Any, ArrayBuffer[Any]](mapSideCombinedRDD, part) } } - deps.toList } - override def getDependencies = deps_ - - @transient var splits_ : Array[Split] = { + override def getSplits = { val array = new Array[Split](part.numPartitions) for (i <- 0 until array.size) { // Each CoGroupSplit will have a dependency per contributing RDD @@ -79,8 +75,6 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) array } - override def getSplits = splits_ - override val partitioner = Some(part) override def compute(s: Split, context: TaskContext): Iterator[(K, Seq[Seq[_]])] = { @@ -117,8 +111,7 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) } override def clearDependencies() { - deps_ = null - splits_ = null + super.clearDependencies() rdds = null } } diff --git a/core/src/main/scala/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/spark/rdd/CoalescedRDD.scala index 4c57434b65..a1aa7a30b0 100644 --- a/core/src/main/scala/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoalescedRDD.scala @@ -31,7 +31,7 @@ class CoalescedRDD[T: ClassManifest]( maxPartitions: Int) extends RDD[T](prev.context, Nil) { // Nil since we implement getDependencies - override def getSplits: Array[Split] = { + override def getSplits = { val prevSplits = prev.splits if (prevSplits.length < maxPartitions) { prevSplits.map(_.index).map{idx => new CoalescedRDDSplit(idx, prev, Array(idx)) } @@ -50,14 +50,13 @@ class CoalescedRDD[T: ClassManifest]( } } - override def getDependencies: 
Seq[Dependency[_]] = List( - new NarrowDependency(prev) { - def getParents(id: Int): Seq[Int] = - splits(id).asInstanceOf[CoalescedRDDSplit].parentsIndices - } - ) + override def getDependencies = Seq(new NarrowDependency(prev) { + def getParents(id: Int): Seq[Int] = + splits(id).asInstanceOf[CoalescedRDDSplit].parentsIndices + }) override def clearDependencies() { + super.clearDependencies() prev = null } } diff --git a/core/src/main/scala/spark/rdd/HadoopRDD.scala b/core/src/main/scala/spark/rdd/HadoopRDD.scala index f547f53812..cd948de967 100644 --- a/core/src/main/scala/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/spark/rdd/HadoopRDD.scala @@ -45,10 +45,9 @@ class HadoopRDD[K, V]( extends RDD[(K, V)](sc, Nil) { // A Hadoop JobConf can be about 10 KB, which is pretty big, so broadcast it - val confBroadcast = sc.broadcast(new SerializableWritable(conf)) + private val confBroadcast = sc.broadcast(new SerializableWritable(conf)) - @transient - val splits_ : Array[Split] = { + override def getSplits = { val inputFormat = createInputFormat(conf) val inputSplits = inputFormat.getSplits(conf, minSplits) val array = new Array[Split](inputSplits.size) @@ -63,8 +62,6 @@ class HadoopRDD[K, V]( .asInstanceOf[InputFormat[K, V]] } - override def getSplits = splits_ - override def compute(theSplit: Split, context: TaskContext) = new Iterator[(K, V)] { val split = theSplit.asInstanceOf[HadoopSplit] var reader: RecordReader[K, V] = null diff --git a/core/src/main/scala/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/spark/rdd/NewHadoopRDD.scala index c3b155fcbd..2d000f5c68 100644 --- a/core/src/main/scala/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/spark/rdd/NewHadoopRDD.scala @@ -29,7 +29,7 @@ class NewHadoopRDD[K, V]( with HadoopMapReduceUtil { // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it - val confBroadcast = sc.broadcast(new SerializableWritable(conf)) + private val confBroadcast = sc.broadcast(new SerializableWritable(conf)) // private val serializableConf = new SerializableWritable(conf) private val jobtrackerId: String = { @@ -39,7 +39,7 @@ class NewHadoopRDD[K, V]( @transient private val jobId = new JobID(jobtrackerId, id) - @transient private val splits_ : Array[Split] = { + override def getSplits = { val inputFormat = inputFormatClass.newInstance val jobContext = newJobContext(conf, jobId) val rawSplits = inputFormat.getSplits(jobContext).toArray @@ -50,8 +50,6 @@ class NewHadoopRDD[K, V]( result } - override def getSplits = splits_ - override def compute(theSplit: Split, context: TaskContext) = new Iterator[(K, V)] { val split = theSplit.asInstanceOf[NewHadoopSplit] val conf = confBroadcast.value.value diff --git a/core/src/main/scala/spark/rdd/SampledRDD.scala b/core/src/main/scala/spark/rdd/SampledRDD.scala index e24ad23b21..81626d5009 100644 --- a/core/src/main/scala/spark/rdd/SampledRDD.scala +++ b/core/src/main/scala/spark/rdd/SampledRDD.scala @@ -19,13 +19,11 @@ class SampledRDD[T: ClassManifest]( seed: Int) extends RDD[T](prev) { - @transient var splits_ : Array[Split] = { + override def getSplits = { val rg = new Random(seed) firstParent[T].splits.map(x => new SampledRDDSplit(x, rg.nextInt)) } - override def getSplits = splits_ - override def getPreferredLocations(split: Split) = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDSplit].prev) @@ -48,8 +46,4 @@ class SampledRDD[T: ClassManifest]( firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac)) } } - - override def 
clearDependencies() { - splits_ = null - } } diff --git a/core/src/main/scala/spark/rdd/UnionRDD.scala b/core/src/main/scala/spark/rdd/UnionRDD.scala index 26a2d511f2..5ac24d2ffc 100644 --- a/core/src/main/scala/spark/rdd/UnionRDD.scala +++ b/core/src/main/scala/spark/rdd/UnionRDD.scala @@ -28,7 +28,7 @@ class UnionRDD[T: ClassManifest]( @transient var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies - override def getSplits: Array[Split] = { + override def getSplits = { val array = new Array[Split](rdds.map(_.splits.size).sum) var pos = 0 for (rdd <- rdds; split <- rdd.splits) { @@ -38,7 +38,7 @@ class UnionRDD[T: ClassManifest]( array } - override def getDependencies: Seq[Dependency[_]] = { + override def getDependencies = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { @@ -53,8 +53,4 @@ class UnionRDD[T: ClassManifest]( override def getPreferredLocations(s: Split): Seq[String] = s.asInstanceOf[UnionSplit[T]].preferredLocations() - - override def clearDependencies() { - rdds = null - } } diff --git a/core/src/main/scala/spark/rdd/ZippedRDD.scala b/core/src/main/scala/spark/rdd/ZippedRDD.scala index e5df6d8c72..a079720a93 100644 --- a/core/src/main/scala/spark/rdd/ZippedRDD.scala +++ b/core/src/main/scala/spark/rdd/ZippedRDD.scala @@ -29,10 +29,9 @@ class ZippedRDD[T: ClassManifest, U: ClassManifest]( sc: SparkContext, var rdd1: RDD[T], var rdd2: RDD[U]) - extends RDD[(T, U)](sc, List(new OneToOneDependency(rdd1), new OneToOneDependency(rdd2))) - with Serializable { + extends RDD[(T, U)](sc, List(new OneToOneDependency(rdd1), new OneToOneDependency(rdd2))) { - override def getSplits: Array[Split] = { + override def getSplits = { if (rdd1.splits.size != rdd2.splits.size) { throw new IllegalArgumentException("Can't zip RDDs with unequal numbers of partitions") } @@ -54,6 +53,7 @@ class ZippedRDD[T: ClassManifest, U: ClassManifest]( } override def clearDependencies() { + super.clearDependencies() rdd1 = null rdd2 = null } -- cgit v1.2.3 From dab81a85116c828231277bcfa3a7230b29bddaba Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 5 Feb 2013 20:53:14 -0800 Subject: Fixing to match Spark styleguide --- .../src/main/scala/spark/examples/LogQuery.scala | 66 ++++++++++++++++++++++ .../src/main/scala/spark/examples/OLAPQuery.scala | 66 ---------------------- 2 files changed, 66 insertions(+), 66 deletions(-) create mode 100644 examples/src/main/scala/spark/examples/LogQuery.scala delete mode 100644 examples/src/main/scala/spark/examples/OLAPQuery.scala diff --git a/examples/src/main/scala/spark/examples/LogQuery.scala b/examples/src/main/scala/spark/examples/LogQuery.scala new file mode 100644 index 0000000000..5330b8da94 --- /dev/null +++ b/examples/src/main/scala/spark/examples/LogQuery.scala @@ -0,0 +1,66 @@ +package spark.examples + +import spark.SparkContext +import spark.SparkContext._ +/** + * Executes a roll up-style query against Apache logs. 
+ */ +object LogQuery { + val exampleApacheLogs = List( + """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://images.com/2013/Generic.jpg + | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; + | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR + | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR + | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.350 "-" - "" 265 923 934 "" + | 62.24.11.25 images.com 1358492167 - Whatup""".stripMargin.replace("\n", ""), + """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://images.com/2013/Generic.jpg + | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; + | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR + | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR + | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.352 "-" - "" 256 977 988 "" + | 0 73.23.2.15 images.com 1358492557 - Whatup""".stripMargin.replace("\n", "") + ) + + def main(args: Array[String]) { + if (args.length == 0) { + System.err.println("Usage: LogQuery [logFile]") + System.exit(1) + } + val sc = new SparkContext(args(0), "Log Query") + + val dataSet = + if (args.length == 2) sc.textFile(args(1)) + else sc.parallelize(exampleApacheLogs) + + val apacheLogRegex = + """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r + + /** Tracks the total query count and number of aggregate bytes for a particular group. */ + class Stats(val count: Int, val numBytes: Int) extends Serializable { + def merge(other: Stats) = new Stats(count + other.count, numBytes + other.numBytes) + override def toString = "bytes=%s\tn=%s".format(numBytes, count) + } + + def extractKey(line: String): (String, String, String) = { + apacheLogRegex.findFirstIn(line) match { + case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => + if (user != "\"-\"") (ip, user, query) + else (null, null, null) + case _ => (null, null, null) + } + } + + def extractStats(line: String): Stats = { + apacheLogRegex.findFirstIn(line) match { + case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => + new Stats(1, bytes.toInt) + case _ => new Stats(1, 0) + } + } + + dataSet.map(line => (extractKey(line), extractStats(line))) + .reduceByKey((a, b) => a.merge(b)) + .collect().foreach{ + case (user, query) => println("%s\t%s".format(user, query))} + } +} diff --git a/examples/src/main/scala/spark/examples/OLAPQuery.scala b/examples/src/main/scala/spark/examples/OLAPQuery.scala deleted file mode 100644 index ff3af01b17..0000000000 --- a/examples/src/main/scala/spark/examples/OLAPQuery.scala +++ /dev/null @@ -1,66 +0,0 @@ -package spark.examples - -import spark.SparkContext -import spark.SparkContext._ -/** - * Executes a roll up-style query against Apache logs. 
- */ -object OLAPQuery { - val exampleApacheLogs = List( - """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://images.com/2013/Generic.jpg - | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; - | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR - | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR - | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.350 "-" - "" 265 923 934 "" - | 62.24.11.25 images.com 1358492167 - Whatup""".stripMargin.replace("\n", ""), - """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://images.com/2013/Generic.jpg - | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; - | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR - | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR - | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.352 "-" - "" 256 977 988 "" - | 0 73.23.2.15 images.com 1358492557 - Whatup""".stripMargin.replace("\n", "") - ) - - def main(args: Array[String]) { - if (args.length == 0) { - System.err.println("Usage: OLAPQuery [logFile]") - System.exit(1) - } - val sc = new SparkContext(args(0), "OLAP Query") - - val dataSet = - if (args.length == 2) sc.textFile(args(1)) - else sc.parallelize(exampleApacheLogs) - - val apache_log_regex = - """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r - - /** Tracks the total query count and number of aggregate bytes for a particular group. */ - class Stats(val count: Int, val numBytes: Int) extends Serializable { - def merge(other: Stats) = new Stats(count + other.count, numBytes + other.numBytes) - override def toString = "bytes=%s\tn=%s".format(numBytes, count) - } - - def extractKey(line: String): (String, String, String) = { - apache_log_regex findFirstIn line match { - case Some(apache_log_regex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => - if (user != "\"-\"") (ip, user, query) - else (null, null, null) - case _ => (null, null, null) - } - } - - def extractStats(line: String): Stats = { - apache_log_regex findFirstIn line match { - case Some(apache_log_regex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => - new Stats(1, bytes.toInt) - case _ => new Stats(1, 0) - } - } - - dataSet.map(line => (extractKey(line), extractStats(line))) - .reduceByKey((a, b) => a.merge(b)) - .collect().foreach{ - case (user, query) => println("%s\t%s".format(user, query))} - } -} -- cgit v1.2.3 From 934a53c8b677df524315a75011b4c9396eb4b54e Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Tue, 5 Feb 2013 22:19:58 -0800 Subject: Change docs on 'reduce' since the merging of local reduces no longer preserves ordering, so the reduce function must also be commutative. --- core/src/main/scala/spark/RDD.scala | 2 +- core/src/main/scala/spark/api/java/JavaRDDLike.scala | 2 +- docs/scala-programming-guide.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala index d15c6f7396..3ad3b4d233 100644 --- a/core/src/main/scala/spark/RDD.scala +++ b/core/src/main/scala/spark/RDD.scala @@ -330,7 +330,7 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial def toArray(): Array[T] = collect() /** - * Reduces the elements of this RDD using the specified associative binary operator. 
+ * Reduces the elements of this RDD using the specified commutative and associative binary operator. */ def reduce(f: (T, T) => T): T = { val cleanF = sc.clean(f) diff --git a/core/src/main/scala/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/spark/api/java/JavaRDDLike.scala index 81d3a94466..6da89f518f 100644 --- a/core/src/main/scala/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/spark/api/java/JavaRDDLike.scala @@ -201,7 +201,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { } /** - * Reduces the elements of this RDD using the specified associative binary operator. + * Reduces the elements of this RDD using the specified commutative and associative binary operator. */ def reduce(f: JFunction2[T, T, T]): T = rdd.reduce(f) diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md index 7350eca837..95de4f50bb 100644 --- a/docs/scala-programming-guide.md +++ b/docs/scala-programming-guide.md @@ -203,7 +203,7 @@ A complete list of transformations is available in the [RDD API doc](api/core/in ActionMeaning reduce(func) - Aggregate the elements of the dataset using a function func (which takes two arguments and returns one). The function should be associative so that it can be computed correctly in parallel. + Aggregate the elements of the dataset using a function func (which takes two arguments and returns one). The function should be commutative and associative so that it can be computed correctly in parallel. collect() -- cgit v1.2.3 From da52b16b38a5d6200ef2c6a3b7ba28ddf35a30f8 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 9 Feb 2013 10:11:54 -0600 Subject: Remove RDD.coalesce default arguments. --- core/src/main/scala/spark/RDD.scala | 3 +-- core/src/main/scala/spark/api/java/JavaRDDLike.scala | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala index 5f99591fd5..dea52eb5c6 100644 --- a/core/src/main/scala/spark/RDD.scala +++ b/core/src/main/scala/spark/RDD.scala @@ -235,8 +235,7 @@ abstract class RDD[T: ClassManifest]( /** * Return a new RDD that is reduced into `numSplits` partitions. */ - def coalesce(numSplits: Int = sc.defaultParallelism): RDD[T] = - new CoalescedRDD(this, numSplits) + def coalesce(numSplits: Int): RDD[T] = new CoalescedRDD(this, numSplits) /** * Return a sampled subset of this RDD. diff --git a/core/src/main/scala/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/spark/api/java/JavaRDDLike.scala index 295eaa57c0..d3a4b62553 100644 --- a/core/src/main/scala/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/spark/api/java/JavaRDDLike.scala @@ -130,11 +130,6 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround JavaPairRDD.fromRDD(rdd.cartesian(other.rdd)(other.classManifest))(classManifest, other.classManifest) - /** - * Return a new RDD that is reduced into the default number of partitions. - */ - def coalesce(): RDD[T] = coalesce(rdd.context.defaultParallelism) - /** * Return a new RDD that is reduced into `numSplits` partitions. */ -- cgit v1.2.3 From 2a18cd826c42d7c6b35eaedde1e4c423b6a1b1e5 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 9 Feb 2013 10:12:04 -0600 Subject: Add back return types. 
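A toy example of why the explicit return types help, using invented `Store`/`ListStore` types rather than the Spark classes: without an annotation, Scala infers the return type from the method body, which can narrow an overridden signature or leak the concrete implementation type into the public API:

    trait Store {
      def keys: Seq[String]
    }

    class ListStore extends Store {
      // Without the explicit ": Seq[String]" the compiler would infer
      // List[String], exposing the implementation collection in the signature.
      override def keys: Seq[String] = List("a", "b", "c")
    }

    object ReturnTypeExample extends App {
      println(new ListStore().keys.mkString(", "))
    }
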
--- core/src/main/scala/spark/rdd/BlockRDD.scala | 4 ++-- core/src/main/scala/spark/rdd/CartesianRDD.scala | 4 ++-- core/src/main/scala/spark/rdd/CheckpointRDD.scala | 2 +- core/src/main/scala/spark/rdd/CoGroupedRDD.scala | 4 ++-- core/src/main/scala/spark/rdd/CoalescedRDD.scala | 12 +++++++----- core/src/main/scala/spark/rdd/FilteredRDD.scala | 2 +- core/src/main/scala/spark/rdd/FlatMappedRDD.scala | 2 +- core/src/main/scala/spark/rdd/GlommedRDD.scala | 2 +- core/src/main/scala/spark/rdd/HadoopRDD.scala | 4 ++-- core/src/main/scala/spark/rdd/MapPartitionsRDD.scala | 2 +- .../src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala | 2 +- core/src/main/scala/spark/rdd/MappedRDD.scala | 2 +- core/src/main/scala/spark/rdd/NewHadoopRDD.scala | 4 ++-- core/src/main/scala/spark/rdd/PartitionPruningRDD.scala | 2 +- core/src/main/scala/spark/rdd/PipedRDD.scala | 2 +- core/src/main/scala/spark/rdd/SampledRDD.scala | 6 +++--- core/src/main/scala/spark/rdd/ShuffledRDD.scala | 4 +++- core/src/main/scala/spark/rdd/UnionRDD.scala | 4 ++-- core/src/main/scala/spark/rdd/ZippedRDD.scala | 2 +- 19 files changed, 35 insertions(+), 31 deletions(-) diff --git a/core/src/main/scala/spark/rdd/BlockRDD.scala b/core/src/main/scala/spark/rdd/BlockRDD.scala index 4214817c65..17989c5ce5 100644 --- a/core/src/main/scala/spark/rdd/BlockRDD.scala +++ b/core/src/main/scala/spark/rdd/BlockRDD.scala @@ -18,7 +18,7 @@ class BlockRDD[T: ClassManifest](sc: SparkContext, @transient blockIds: Array[St HashMap(blockIds.zip(locations):_*) } - override def getSplits = (0 until blockIds.size).map(i => { + override def getSplits: Array[Split] = (0 until blockIds.size).map(i => { new BlockRDDSplit(blockIds(i), i).asInstanceOf[Split] }).toArray @@ -33,7 +33,7 @@ class BlockRDD[T: ClassManifest](sc: SparkContext, @transient blockIds: Array[St } } - override def getPreferredLocations(split: Split) = + override def getPreferredLocations(split: Split): Seq[String] = locations_(split.asInstanceOf[BlockRDDSplit].blockId) } diff --git a/core/src/main/scala/spark/rdd/CartesianRDD.scala b/core/src/main/scala/spark/rdd/CartesianRDD.scala index 2f572a1941..41cbbd0093 100644 --- a/core/src/main/scala/spark/rdd/CartesianRDD.scala +++ b/core/src/main/scala/spark/rdd/CartesianRDD.scala @@ -35,7 +35,7 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest]( val numSplitsInRdd2 = rdd2.splits.size - override def getSplits = { + override def getSplits: Array[Split] = { // create the cross product split val array = new Array[Split](rdd1.splits.size * rdd2.splits.size) for (s1 <- rdd1.splits; s2 <- rdd2.splits) { @@ -45,7 +45,7 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest]( array } - override def getPreferredLocations(split: Split) = { + override def getPreferredLocations(split: Split): Seq[String] = { val currSplit = split.asInstanceOf[CartesianSplit] rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2) } diff --git a/core/src/main/scala/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/spark/rdd/CheckpointRDD.scala index 7cde523f11..3558d4673f 100644 --- a/core/src/main/scala/spark/rdd/CheckpointRDD.scala +++ b/core/src/main/scala/spark/rdd/CheckpointRDD.scala @@ -20,7 +20,7 @@ class CheckpointRDD[T: ClassManifest](sc: SparkContext, val checkpointPath: Stri @transient val fs = new Path(checkpointPath).getFileSystem(sc.hadoopConfiguration) - override def getSplits = { + override def getSplits: Array[Split] = { val dirContents = fs.listStatus(new Path(checkpointPath)) val splitFiles = 
dirContents.map(_.getPath.toString).filter(_.contains("part-")).sorted val numSplits = splitFiles.size diff --git a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala index d31ce13706..0a1e2cbee0 100644 --- a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala @@ -45,7 +45,7 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) private val aggr = new CoGroupAggregator - override def getDependencies = { + override def getDependencies: Seq[Dependency[_]] = { rdds.map { rdd => if (rdd.partitioner == Some(part)) { logInfo("Adding one-to-one dependency with " + rdd) @@ -58,7 +58,7 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) } } - override def getSplits = { + override def getSplits: Array[Split] = { val array = new Array[Split](part.numPartitions) for (i <- 0 until array.size) { // Each CoGroupSplit will have a dependency per contributing RDD diff --git a/core/src/main/scala/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/spark/rdd/CoalescedRDD.scala index a1aa7a30b0..fcd26da43a 100644 --- a/core/src/main/scala/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoalescedRDD.scala @@ -31,7 +31,7 @@ class CoalescedRDD[T: ClassManifest]( maxPartitions: Int) extends RDD[T](prev.context, Nil) { // Nil since we implement getDependencies - override def getSplits = { + override def getSplits: Array[Split] = { val prevSplits = prev.splits if (prevSplits.length < maxPartitions) { prevSplits.map(_.index).map{idx => new CoalescedRDDSplit(idx, prev, Array(idx)) } @@ -50,10 +50,12 @@ class CoalescedRDD[T: ClassManifest]( } } - override def getDependencies = Seq(new NarrowDependency(prev) { - def getParents(id: Int): Seq[Int] = - splits(id).asInstanceOf[CoalescedRDDSplit].parentsIndices - }) + override def getDependencies: Seq[Dependency[_]] = { + Seq(new NarrowDependency(prev) { + def getParents(id: Int): Seq[Int] = + splits(id).asInstanceOf[CoalescedRDDSplit].parentsIndices + }) + } override def clearDependencies() { super.clearDependencies() diff --git a/core/src/main/scala/spark/rdd/FilteredRDD.scala b/core/src/main/scala/spark/rdd/FilteredRDD.scala index 6dbe235bd9..93e398ea2b 100644 --- a/core/src/main/scala/spark/rdd/FilteredRDD.scala +++ b/core/src/main/scala/spark/rdd/FilteredRDD.scala @@ -7,7 +7,7 @@ private[spark] class FilteredRDD[T: ClassManifest]( f: T => Boolean) extends RDD[T](prev) { - override def getSplits = firstParent[T].splits + override def getSplits: Array[Split] = firstParent[T].splits override val partitioner = prev.partitioner // Since filter cannot change a partition's keys diff --git a/core/src/main/scala/spark/rdd/FlatMappedRDD.scala b/core/src/main/scala/spark/rdd/FlatMappedRDD.scala index 1b604c66e2..8c2a610593 100644 --- a/core/src/main/scala/spark/rdd/FlatMappedRDD.scala +++ b/core/src/main/scala/spark/rdd/FlatMappedRDD.scala @@ -9,7 +9,7 @@ class FlatMappedRDD[U: ClassManifest, T: ClassManifest]( f: T => TraversableOnce[U]) extends RDD[U](prev) { - override def getSplits = firstParent[T].splits + override def getSplits: Array[Split] = firstParent[T].splits override def compute(split: Split, context: TaskContext) = firstParent[T].iterator(split, context).flatMap(f) diff --git a/core/src/main/scala/spark/rdd/GlommedRDD.scala b/core/src/main/scala/spark/rdd/GlommedRDD.scala index 051bffed19..70b9b4e34e 100644 --- a/core/src/main/scala/spark/rdd/GlommedRDD.scala +++ 
b/core/src/main/scala/spark/rdd/GlommedRDD.scala @@ -5,7 +5,7 @@ import spark.{RDD, Split, TaskContext} private[spark] class GlommedRDD[T: ClassManifest](prev: RDD[T]) extends RDD[Array[T]](prev) { - override def getSplits = firstParent[T].splits + override def getSplits: Array[Split] = firstParent[T].splits override def compute(split: Split, context: TaskContext) = Array(firstParent[T].iterator(split, context).toArray).iterator diff --git a/core/src/main/scala/spark/rdd/HadoopRDD.scala b/core/src/main/scala/spark/rdd/HadoopRDD.scala index cd948de967..854993737b 100644 --- a/core/src/main/scala/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/spark/rdd/HadoopRDD.scala @@ -47,7 +47,7 @@ class HadoopRDD[K, V]( // A Hadoop JobConf can be about 10 KB, which is pretty big, so broadcast it private val confBroadcast = sc.broadcast(new SerializableWritable(conf)) - override def getSplits = { + override def getSplits: Array[Split] = { val inputFormat = createInputFormat(conf) val inputSplits = inputFormat.getSplits(conf, minSplits) val array = new Array[Split](inputSplits.size) @@ -106,7 +106,7 @@ class HadoopRDD[K, V]( } } - override def getPreferredLocations(split: Split) = { + override def getPreferredLocations(split: Split): Seq[String] = { // TODO: Filtering out "localhost" in case of file:// URLs val hadoopSplit = split.asInstanceOf[HadoopSplit] hadoopSplit.inputSplit.value.getLocations.filter(_ != "localhost") diff --git a/core/src/main/scala/spark/rdd/MapPartitionsRDD.scala b/core/src/main/scala/spark/rdd/MapPartitionsRDD.scala index 073f7d7d2a..7b0b4525c7 100644 --- a/core/src/main/scala/spark/rdd/MapPartitionsRDD.scala +++ b/core/src/main/scala/spark/rdd/MapPartitionsRDD.scala @@ -13,7 +13,7 @@ class MapPartitionsRDD[U: ClassManifest, T: ClassManifest]( override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None - override def getSplits = firstParent[T].splits + override def getSplits: Array[Split] = firstParent[T].splits override def compute(split: Split, context: TaskContext) = f(firstParent[T].iterator(split, context)) diff --git a/core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala b/core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala index 2ddc3d01b6..c6dc1080a9 100644 --- a/core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala +++ b/core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala @@ -15,7 +15,7 @@ class MapPartitionsWithSplitRDD[U: ClassManifest, T: ClassManifest]( preservesPartitioning: Boolean ) extends RDD[U](prev) { - override def getSplits = firstParent[T].splits + override def getSplits: Array[Split] = firstParent[T].splits override val partitioner = if (preservesPartitioning) prev.partitioner else None diff --git a/core/src/main/scala/spark/rdd/MappedRDD.scala b/core/src/main/scala/spark/rdd/MappedRDD.scala index 5466c9c657..6074f411e3 100644 --- a/core/src/main/scala/spark/rdd/MappedRDD.scala +++ b/core/src/main/scala/spark/rdd/MappedRDD.scala @@ -6,7 +6,7 @@ private[spark] class MappedRDD[U: ClassManifest, T: ClassManifest](prev: RDD[T], f: T => U) extends RDD[U](prev) { - override def getSplits = firstParent[T].splits + override def getSplits: Array[Split] = firstParent[T].splits override def compute(split: Split, context: TaskContext) = firstParent[T].iterator(split, context).map(f) diff --git a/core/src/main/scala/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/spark/rdd/NewHadoopRDD.scala index 2d000f5c68..345ae79d74 100644 --- a/core/src/main/scala/spark/rdd/NewHadoopRDD.scala +++ 
b/core/src/main/scala/spark/rdd/NewHadoopRDD.scala @@ -39,7 +39,7 @@ class NewHadoopRDD[K, V]( @transient private val jobId = new JobID(jobtrackerId, id) - override def getSplits = { + override def getSplits: Array[Split] = { val inputFormat = inputFormatClass.newInstance val jobContext = newJobContext(conf, jobId) val rawSplits = inputFormat.getSplits(jobContext).toArray @@ -83,7 +83,7 @@ class NewHadoopRDD[K, V]( } } - override def getPreferredLocations(split: Split) = { + override def getPreferredLocations(split: Split): Seq[String] = { val theSplit = split.asInstanceOf[NewHadoopSplit] theSplit.serializableHadoopSplit.value.getLocations.filter(_ != "localhost") } diff --git a/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala b/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala index a50ce75171..d1553181c1 100644 --- a/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala +++ b/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala @@ -37,6 +37,6 @@ class PartitionPruningRDD[T: ClassManifest]( override def compute(split: Split, context: TaskContext) = firstParent[T].iterator( split.asInstanceOf[PartitionPruningRDDSplit].parentSplit, context) - override protected def getSplits = + override protected def getSplits: Array[Split] = getDependencies.head.asInstanceOf[PruneDependency[T]].partitions } diff --git a/core/src/main/scala/spark/rdd/PipedRDD.scala b/core/src/main/scala/spark/rdd/PipedRDD.scala index 6631f83510..56032a8659 100644 --- a/core/src/main/scala/spark/rdd/PipedRDD.scala +++ b/core/src/main/scala/spark/rdd/PipedRDD.scala @@ -27,7 +27,7 @@ class PipedRDD[T: ClassManifest]( // using a standard StringTokenizer (i.e. by spaces) def this(prev: RDD[T], command: String) = this(prev, PipedRDD.tokenize(command)) - override def getSplits = firstParent[T].splits + override def getSplits: Array[Split] = firstParent[T].splits override def compute(split: Split, context: TaskContext): Iterator[String] = { val pb = new ProcessBuilder(command) diff --git a/core/src/main/scala/spark/rdd/SampledRDD.scala b/core/src/main/scala/spark/rdd/SampledRDD.scala index 81626d5009..f2a144e2e0 100644 --- a/core/src/main/scala/spark/rdd/SampledRDD.scala +++ b/core/src/main/scala/spark/rdd/SampledRDD.scala @@ -19,15 +19,15 @@ class SampledRDD[T: ClassManifest]( seed: Int) extends RDD[T](prev) { - override def getSplits = { + override def getSplits: Array[Split] = { val rg = new Random(seed) firstParent[T].splits.map(x => new SampledRDDSplit(x, rg.nextInt)) } - override def getPreferredLocations(split: Split) = + override def getPreferredLocations(split: Split): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDSplit].prev) - override def compute(splitIn: Split, context: TaskContext) = { + override def compute(splitIn: Split, context: TaskContext): Iterator[T] = { val split = splitIn.asInstanceOf[SampledRDDSplit] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with diff --git a/core/src/main/scala/spark/rdd/ShuffledRDD.scala b/core/src/main/scala/spark/rdd/ShuffledRDD.scala index d396478673..bf69b5150b 100644 --- a/core/src/main/scala/spark/rdd/ShuffledRDD.scala +++ b/core/src/main/scala/spark/rdd/ShuffledRDD.scala @@ -22,7 +22,9 @@ class ShuffledRDD[K, V]( override val partitioner = Some(part) - override def getSplits = Array.tabulate[Split](part.numPartitions)(i => new ShuffledRDDSplit(i)) + override def getSplits: Array[Split] = { + Array.tabulate[Split](part.numPartitions)(i => new ShuffledRDDSplit(i)) + } 
override def compute(split: Split, context: TaskContext): Iterator[(K, V)] = { val shuffledId = dependencies.head.asInstanceOf[ShuffleDependency[K, V]].shuffleId diff --git a/core/src/main/scala/spark/rdd/UnionRDD.scala b/core/src/main/scala/spark/rdd/UnionRDD.scala index 5ac24d2ffc..ebc0068228 100644 --- a/core/src/main/scala/spark/rdd/UnionRDD.scala +++ b/core/src/main/scala/spark/rdd/UnionRDD.scala @@ -28,7 +28,7 @@ class UnionRDD[T: ClassManifest]( @transient var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies - override def getSplits = { + override def getSplits: Array[Split] = { val array = new Array[Split](rdds.map(_.splits.size).sum) var pos = 0 for (rdd <- rdds; split <- rdd.splits) { @@ -38,7 +38,7 @@ class UnionRDD[T: ClassManifest]( array } - override def getDependencies = { + override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { diff --git a/core/src/main/scala/spark/rdd/ZippedRDD.scala b/core/src/main/scala/spark/rdd/ZippedRDD.scala index a079720a93..1ce70268bb 100644 --- a/core/src/main/scala/spark/rdd/ZippedRDD.scala +++ b/core/src/main/scala/spark/rdd/ZippedRDD.scala @@ -31,7 +31,7 @@ class ZippedRDD[T: ClassManifest, U: ClassManifest]( var rdd2: RDD[U]) extends RDD[(T, U)](sc, List(new OneToOneDependency(rdd1), new OneToOneDependency(rdd2))) { - override def getSplits = { + override def getSplits: Array[Split] = { if (rdd1.splits.size != rdd2.splits.size) { throw new IllegalArgumentException("Can't zip RDDs with unequal numbers of partitions") } -- cgit v1.2.3 From b7a1fb5c5dfe12d18fe9c4b1da1818a143cd247f Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Sat, 9 Feb 2013 12:14:11 -0800 Subject: Add commutative requirement for 'reduce' to Python docstring. --- python/pyspark/rdd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 4cda6cf661..6b6ab6abd9 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -274,8 +274,8 @@ class RDD(object): def reduce(self, f): """ - Reduces the elements of this RDD using the specified associative binary - operator. + Reduces the elements of this RDD using the specified commutative and + associative binary operator. >>> from operator import add >>> sc.parallelize([1, 2, 3, 4, 5]).reduce(add) -- cgit v1.2.3 From fb7599870f4e3ee4e5a1e3c6e74ac2eaa2aaabf0 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 9 Feb 2013 16:10:52 -0600 Subject: Fix JavaRDDLike.coalesce return type. --- core/src/main/scala/spark/api/java/JavaRDDLike.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/spark/api/java/JavaRDDLike.scala index d3a4b62553..9e52c224dd 100644 --- a/core/src/main/scala/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/spark/api/java/JavaRDDLike.scala @@ -133,7 +133,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround /** * Return a new RDD that is reduced into `numSplits` partitions. */ - def coalesce(numSplits: Int): RDD[T] = rdd.coalesce(numSplits) + def coalesce(numSplits: Int): JavaRDD[T] = rdd.coalesce(numSplits) /** * Return an RDD of grouped elements. 
Each group consists of a key and a sequence of elements -- cgit v1.2.3 From 4619ee0787066da15628970bd55cb8cec31a372c Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 9 Feb 2013 20:05:42 -0600 Subject: Move JavaRDDLike.coalesce into the right places. --- core/src/main/scala/spark/api/java/JavaDoubleRDD.scala | 5 +++++ core/src/main/scala/spark/api/java/JavaPairRDD.scala | 5 +++++ core/src/main/scala/spark/api/java/JavaRDD.scala | 5 +++++ core/src/main/scala/spark/api/java/JavaRDDLike.scala | 5 ----- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/spark/api/java/JavaDoubleRDD.scala index 843e1bd18b..2810631b41 100644 --- a/core/src/main/scala/spark/api/java/JavaDoubleRDD.scala +++ b/core/src/main/scala/spark/api/java/JavaDoubleRDD.scala @@ -52,6 +52,11 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, Jav def filter(f: JFunction[Double, java.lang.Boolean]): JavaDoubleRDD = fromRDD(srdd.filter(x => f(x).booleanValue())) + /** + * Return a new RDD that is reduced into `numSplits` partitions. + */ + def coalesce(numSplits: Int): JavaDoubleRDD = fromRDD(srdd.coalesce(numSplits)) + /** * Return a sampled subset of this RDD. */ diff --git a/core/src/main/scala/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/spark/api/java/JavaPairRDD.scala index 8ce32e0e2f..8a123bdb47 100644 --- a/core/src/main/scala/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/spark/api/java/JavaPairRDD.scala @@ -62,6 +62,11 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif def filter(f: Function[(K, V), java.lang.Boolean]): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.filter(x => f(x).booleanValue())) + /** + * Return a new RDD that is reduced into `numSplits` partitions. + */ + def coalesce(numSplits: Int): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.coalesce(numSplits)) + /** * Return a sampled subset of this RDD. */ diff --git a/core/src/main/scala/spark/api/java/JavaRDD.scala b/core/src/main/scala/spark/api/java/JavaRDD.scala index ac31350ec3..23e7ae2726 100644 --- a/core/src/main/scala/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/spark/api/java/JavaRDD.scala @@ -38,6 +38,11 @@ JavaRDDLike[T, JavaRDD[T]] { def filter(f: JFunction[T, java.lang.Boolean]): JavaRDD[T] = wrapRDD(rdd.filter((x => f(x).booleanValue()))) + /** + * Return a new RDD that is reduced into `numSplits` partitions. + */ + def coalesce(numSplits: Int): JavaRDD[T] = rdd.coalesce(numSplits) + /** * Return a sampled subset of this RDD. */ diff --git a/core/src/main/scala/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/spark/api/java/JavaRDDLike.scala index 9e52c224dd..60025b459c 100644 --- a/core/src/main/scala/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/spark/api/java/JavaRDDLike.scala @@ -130,11 +130,6 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround JavaPairRDD.fromRDD(rdd.cartesian(other.rdd)(other.classManifest))(classManifest, other.classManifest) - /** - * Return a new RDD that is reduced into `numSplits` partitions. - */ - def coalesce(numSplits: Int): JavaRDD[T] = rdd.coalesce(numSplits) - /** * Return an RDD of grouped elements. Each group consists of a key and a sequence of elements * mapping to that key. 
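For context, a minimal usage sketch of the relocated coalesce (this sketch is not part of the patch series): it illustrates why returning JavaRDD[T] rather than the underlying Scala RDD[T] matters, since Java callers can keep chaining Java-API methods on the result. Only JavaRDD.coalesce and its return type come from the diffs above; the JavaSparkContext setup, the sample data, and the class name CoalesceSketch are assumed for illustration.

// Usage sketch only -- assumes a Spark build containing the patches above is on the classpath.
import java.util.Arrays;
import java.util.List;

import spark.api.java.JavaRDD;
import spark.api.java.JavaSparkContext;

public class CoalesceSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "coalesce-sketch");
    // Start with 4 partitions, then shrink to 2.
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6), 4);
    // With the return-type fix, coalesce yields a JavaRDD<Integer> rather than the
    // Scala RDD<Integer>, so Java-API methods such as collect() work directly on it.
    JavaRDD<Integer> fewer = rdd.coalesce(2);
    List<Integer> result = fewer.collect();
    System.out.println(result);  // prints [1, 2, 3, 4, 5, 6]
    sc.stop();
  }
}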
-- cgit v1.2.3 From 4975dcdafc0e4b1ab8c6e91525e01a84a0a934aa Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Sun, 10 Feb 2013 12:55:47 -0800 Subject: Fixed a 404 -- missing '.html' --- docs/tuning.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tuning.md b/docs/tuning.md index 9aaa53cd65..738c530458 100644 --- a/docs/tuning.md +++ b/docs/tuning.md @@ -233,7 +233,7 @@ number of cores in your clusters. ## Broadcasting Large Variables -Using the [broadcast functionality](scala-programming-guide#broadcast-variables) +Using the [broadcast functionality](scala-programming-guide.html#broadcast-variables) available in `SparkContext` can greatly reduce the size of each serialized task, and the cost of launching a job over a cluster. If your tasks use any large object from the driver program inside of them (e.g. a static lookup table), consider turning it into a broadcast variable. -- cgit v1.2.3 From 131b56afd0ec20b92502e11acda77c6594380471 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 10 Feb 2013 13:28:31 -0800 Subject: Update issue tracker link in contributing guide. --- docs/contributing-to-spark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing-to-spark.md b/docs/contributing-to-spark.md index c6e01c62d8..14d0dc856b 100644 --- a/docs/contributing-to-spark.md +++ b/docs/contributing-to-spark.md @@ -15,7 +15,7 @@ The Spark team welcomes contributions in the form of GitHub pull requests. Here But first, make sure that you have [configured a spark-env.sh](configuration.html) with at least `SCALA_HOME`, as some of the tests try to spawn subprocesses using this. - Add new unit tests for your code. We use [ScalaTest](http://www.scalatest.org/) for testing. Just add a new Suite in `core/src/test`, or methods to an existing Suite. -- If you'd like to report a bug but don't have time to fix it, you can still post it to our [issues page](https://github.com/mesos/spark/issues), or email the [mailing list](http://www.spark-project.org/mailing-lists.html). +- If you'd like to report a bug but don't have time to fix it, you can still post it to our [issue tracker](https://spark-project.atlassian.net), or email the [mailing list](http://www.spark-project.org/mailing-lists.html). 
# Licensing of Contributions -- cgit v1.2.3 From 0b788b760bc3b2d4d986acb9f6f04592aca9be26 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Wed, 6 Feb 2013 21:56:00 -0800 Subject: Update Windows scripts to launch daemons with less RAM and fix a few other issues Conflicts: run2.cmd --- run2.cmd | 23 ++++++++++++++++++----- sbt/sbt.cmd | 2 +- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/run2.cmd b/run2.cmd index 67f1e465e4..c913a5195e 100644 --- a/run2.cmd +++ b/run2.cmd @@ -11,9 +11,22 @@ set SPARK_HOME=%FWDIR% rem Load environment variables from conf\spark-env.cmd, if it exists if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd" +rem Test that an argument was given +if not "x%1"=="x" goto arg_given + echo Usage: run ^ [^] + goto exit +:arg_given + +set RUNNING_DAEMON=0 +if "%1"=="spark.deploy.master.Master" set RUNNING_DAEMON=1 +if "%1"=="spark.deploy.worker.Worker" set RUNNING_DAEMON=1 +if "x%SPARK_DAEMON_MEMORY%" == "x" set SPARK_DAEMON_MEMORY=512m +if "%RUNNING_DAEMON%"=="1" set SPARK_MEM=%SPARK_DAEMON_MEMORY% +if "%RUNNING_DAEMON%"=="1" set SPARK_JAVA_OPTS=%SPARK_DAEMON_JAVA_OPTS% + rem Check that SCALA_HOME has been specified if not "x%SCALA_HOME%"=="x" goto scala_exists - echo "SCALA_HOME is not set" + echo SCALA_HOME is not set goto exit :scala_exists @@ -40,10 +53,10 @@ rem Build up classpath set CLASSPATH=%SPARK_CLASSPATH%;%MESOS_CLASSPATH%;%FWDIR%conf;%CORE_DIR%\target\scala-%SCALA_VERSION%\classes set CLASSPATH=%CLASSPATH%;%CORE_DIR%\target\scala-%SCALA_VERSION%\test-classes;%CORE_DIR%\src\main\resources set CLASSPATH=%CLASSPATH%;%REPL_DIR%\target\scala-%SCALA_VERSION%\classes;%EXAMPLES_DIR%\target\scala-%SCALA_VERSION%\classes -for /R "%FWDIR%\lib_managed\jars" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j -for /R "%FWDIR%\lib_managed\bundles" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j -for /R "%REPL_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j -for /R "%PYSPARK_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j +set CLASSPATH=%CLASSPATH%;%FWDIR%lib_managed\jars\* +set CLASSPATH=%CLASSPATH%;%FWDIR%lib_managed\bundles\* +set CLASSPATH=%CLASSPATH%;%FWDIR%repl\lib\* +set CLASSPATH=%CLASSPATH%;%FWDIR%python\lib\* set CLASSPATH=%CLASSPATH%;%BAGEL_DIR%\target\scala-%SCALA_VERSION%\classes rem Figure out whether to run our class with java or with the scala launcher. diff --git a/sbt/sbt.cmd b/sbt/sbt.cmd index 6b289ab447..ce3ae70174 100644 --- a/sbt/sbt.cmd +++ b/sbt/sbt.cmd @@ -2,4 +2,4 @@ set EXTRA_ARGS= if not "%MESOS_HOME%x"=="x" set EXTRA_ARGS=-Djava.library.path=%MESOS_HOME%\lib\java set SPARK_HOME=%~dp0.. 
-java -Xmx1200M -XX:MaxPermSize=200m %EXTRA_ARGS% -jar %SPARK_HOME%\sbt\sbt-launch-*.jar "%*" +java -Xmx1200M -XX:MaxPermSize=200m %EXTRA_ARGS% -jar %SPARK_HOME%\sbt\sbt-launch-0.11.3-2.jar "%*" -- cgit v1.2.3 From 8c66c4996220e7ea77aa9e307a744635b9576e5e Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Wed, 6 Feb 2013 14:14:14 -0800 Subject: Tweak web UI so that people don't get confused about master URL format Conflicts: core/src/main/twirl/spark/deploy/master/index.scala.html core/src/main/twirl/spark/deploy/worker/index.scala.html --- core/src/main/scala/spark/deploy/DeployMessage.scala | 9 ++++++--- core/src/main/scala/spark/deploy/master/Master.scala | 2 +- core/src/main/scala/spark/deploy/worker/Worker.scala | 2 +- core/src/main/twirl/spark/deploy/master/index.scala.html | 6 +++--- core/src/main/twirl/spark/deploy/worker/index.scala.html | 6 +++--- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/spark/deploy/DeployMessage.scala b/core/src/main/scala/spark/deploy/DeployMessage.scala index 35f40c6e91..9f05cb4f35 100644 --- a/core/src/main/scala/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/spark/deploy/DeployMessage.scala @@ -76,8 +76,11 @@ private[spark] case object RequestMasterState // Master to MasterWebUI private[spark] -case class MasterState(uri: String, workers: Array[WorkerInfo], activeJobs: Array[JobInfo], - completedJobs: Array[JobInfo]) +case class MasterState(host: String, port: Int, workers: Array[WorkerInfo], + activeJobs: Array[JobInfo], completedJobs: Array[JobInfo]) { + + def uri = "spark://" + host + ":" + port +} // WorkerWebUI to Worker private[spark] case object RequestWorkerState @@ -85,6 +88,6 @@ private[spark] case object RequestWorkerState // Worker to WorkerWebUI private[spark] -case class WorkerState(uri: String, workerId: String, executors: List[ExecutorRunner], +case class WorkerState(host: String, port: Int, workerId: String, executors: List[ExecutorRunner], finishedExecutors: List[ExecutorRunner], masterUrl: String, cores: Int, memory: Int, coresUsed: Int, memoryUsed: Int, masterWebUiUrl: String) diff --git a/core/src/main/scala/spark/deploy/master/Master.scala b/core/src/main/scala/spark/deploy/master/Master.scala index 92e7914b1b..5986281d97 100644 --- a/core/src/main/scala/spark/deploy/master/Master.scala +++ b/core/src/main/scala/spark/deploy/master/Master.scala @@ -131,7 +131,7 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor } case RequestMasterState => { - sender ! MasterState(ip + ":" + port, workers.toArray, jobs.toArray, completedJobs.toArray) + sender ! MasterState(ip, port, workers.toArray, jobs.toArray, completedJobs.toArray) } } diff --git a/core/src/main/scala/spark/deploy/worker/Worker.scala b/core/src/main/scala/spark/deploy/worker/Worker.scala index 38547ec4f1..62f01776a9 100644 --- a/core/src/main/scala/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/spark/deploy/worker/Worker.scala @@ -140,7 +140,7 @@ private[spark] class Worker( masterDisconnected() case RequestWorkerState => { - sender ! WorkerState(ip + ":" + port, workerId, executors.values.toList, + sender ! 
WorkerState(ip, port, workerId, executors.values.toList, finishedExecutors.values.toList, masterUrl, cores, memory, coresUsed, memoryUsed, masterWebUiUrl) } diff --git a/core/src/main/twirl/spark/deploy/master/index.scala.html b/core/src/main/twirl/spark/deploy/master/index.scala.html index 285645c389..cb1651c7e1 100644 --- a/core/src/main/twirl/spark/deploy/master/index.scala.html +++ b/core/src/main/twirl/spark/deploy/master/index.scala.html @@ -2,13 +2,13 @@ @import spark.deploy.master._ @import spark.Utils -@spark.common.html.layout(title = "Spark Master on " + state.uri) { - +@spark.common.html.layout(title = "Spark Master on " + state.host) { +
    -
-  • URL: spark://@(state.uri)
+  • URL: @(state.uri)
  • Workers: @state.workers.size
  • Cores: @{state.workers.map(_.cores).sum} Total, @{state.workers.map(_.coresUsed).sum} Used
  • diff --git a/core/src/main/twirl/spark/deploy/worker/index.scala.html b/core/src/main/twirl/spark/deploy/worker/index.scala.html index 1d703dae58..c39f769a73 100644 --- a/core/src/main/twirl/spark/deploy/worker/index.scala.html +++ b/core/src/main/twirl/spark/deploy/worker/index.scala.html @@ -1,8 +1,8 @@ @(worker: spark.deploy.WorkerState) @import spark.Utils -@spark.common.html.layout(title = "Spark Worker on " + worker.uri) { - +@spark.common.html.layout(title = "Spark Worker on " + worker.host) { +
    @@ -10,12 +10,12 @@
  • ID: @worker.workerId
  • Master URL: @worker.masterUrl
-   (WebUI at @worker.masterWebUiUrl)
  • Cores: @worker.cores (@worker.coresUsed Used)
  • Memory: @{Utils.memoryMegabytesToString(worker.memory)} (@{Utils.memoryMegabytesToString(worker.memoryUsed)} Used)
+      Back to Master
-- cgit v1.2.3 From 05d2e94838d5b728df203d87708beaf3f4aa4c81 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Wed, 6 Feb 2013 14:34:46 -0800 Subject: Use a separate memory setting for standalone cluster daemons Conflicts: docs/_config.yml --- docs/configuration.md | 10 +++++++++- docs/spark-standalone.md | 8 ++++++++ run | 12 ++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index a7054b4321..f1ca77aa78 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -197,6 +197,14 @@ Apart from these, the following properties are also available, and may be useful poor data locality, but the default generally works well. + + spark.worker.timeout + 60 + + Number of seconds after which the standalone deploy master considers a worker lost if it + receives no heartbeats. + + spark.akka.frameSize 10 @@ -218,7 +226,7 @@ Apart from these, the following properties are also available, and may be useful spark.akka.timeout 20 - Communication timeout between Spark nodes. + Communication timeout between Spark nodes, in seconds. diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index bf296221b8..3986c0c79d 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -115,6 +115,14 @@ You can optionally configure the cluster further by setting environment variable SPARK_WORKER_WEBUI_PORT Port for the worker web UI (default: 8081) + + SPARK_DAEMON_MEMORY + Memory to allocate to the Spark master and worker daemons themselves (default: 512m) + + + SPARK_DAEMON_JAVA_OPTS + JVM options for the Spark master and worker daemons themselves (default: none) + diff --git a/run b/run index a094629449..82b1da005a 100755 --- a/run +++ b/run @@ -13,6 +13,18 @@ if [ -e $FWDIR/conf/spark-env.sh ] ; then . $FWDIR/conf/spark-env.sh fi +if [ -z "$1" ]; then + echo "Usage: run []" >&2 + exit 1 +fi + +# If this is a standalone cluster daemon, reset SPARK_JAVA_OPTS and SPARK_MEM to reasonable +# values for that; it doesn't need a lot +if [ "$1" = "spark.deploy.master.Master" -o "$1" = "spark.deploy.worker.Worker" ]; then + SPARK_MEM=${SPARK_DAEMON_MEMORY:-512m} + SPARK_JAVA_OPTS=$SPARK_DAEMON_JAVA_OPTS # Empty by default +fi + if [ "$SPARK_LAUNCH_WITH_SCALA" == "1" ]; then if [ `command -v scala` ]; then RUNNER="scala" -- cgit v1.2.3 From 1b47fa275236657bea358f5c95d89f568c439395 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 6 Feb 2013 09:11:17 +0000 Subject: Detect hard crashes of workers using a heartbeat mechanism. Also fixes some issues in the rest of the code with detecting workers this way. 
Conflicts: core/src/main/scala/spark/deploy/master/Master.scala core/src/main/scala/spark/deploy/worker/Worker.scala core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala core/src/main/scala/spark/scheduler/cluster/StandaloneClusterMessage.scala core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala --- .../main/scala/spark/deploy/DeployMessage.scala | 3 ++- .../main/scala/spark/deploy/master/Master.scala | 29 ++++++++++++++++++++-- .../scala/spark/deploy/master/WorkerInfo.scala | 2 ++ .../main/scala/spark/deploy/worker/Worker.scala | 7 ++++++ .../scheduler/cluster/ExecutorLossReason.scala | 4 +-- .../cluster/SparkDeploySchedulerBackend.scala | 1 + .../cluster/StandaloneClusterMessage.scala | 3 +++ .../cluster/StandaloneSchedulerBackend.scala | 20 +++++++++++++-- 8 files changed, 62 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/spark/deploy/DeployMessage.scala b/core/src/main/scala/spark/deploy/DeployMessage.scala index 9f05cb4f35..1d88d4bc84 100644 --- a/core/src/main/scala/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/spark/deploy/DeployMessage.scala @@ -30,6 +30,8 @@ case class ExecutorStateChanged( exitStatus: Option[Int]) extends DeployMessage +private[spark] case class Heartbeat(workerId: String) extends DeployMessage + // Master to Worker private[spark] case class RegisteredWorker(masterWebUiUrl: String) extends DeployMessage @@ -45,7 +47,6 @@ private[spark] case class LaunchExecutor( sparkHome: String) extends DeployMessage - // Client to Master private[spark] case class RegisterJob(jobDescription: JobDescription) extends DeployMessage diff --git a/core/src/main/scala/spark/deploy/master/Master.scala b/core/src/main/scala/spark/deploy/master/Master.scala index 5986281d97..d985261600 100644 --- a/core/src/main/scala/spark/deploy/master/Master.scala +++ b/core/src/main/scala/spark/deploy/master/Master.scala @@ -3,6 +3,7 @@ package spark.deploy.master import akka.actor._ import akka.actor.Terminated import akka.remote.{RemoteClientLifeCycleEvent, RemoteClientDisconnected, RemoteClientShutdown} +import akka.util.duration._ import java.text.SimpleDateFormat import java.util.Date @@ -16,6 +17,7 @@ import spark.util.AkkaUtils private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging { val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For job IDs + val WORKER_TIMEOUT = System.getProperty("spark.worker.timeout", "60").toLong * 1000 var nextJobNumber = 0 val workers = new HashSet[WorkerInfo] @@ -46,6 +48,7 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor // Listen for remote client disconnection events, since they don't go through Akka's watch() context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent]) startWebUi() + context.system.scheduler.schedule(0 millis, WORKER_TIMEOUT millis)(timeOutDeadWorkers()) } def startWebUi() { @@ -111,6 +114,15 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor } } + case Heartbeat(workerId) => { + idToWorker.get(workerId) match { + case Some(workerInfo) => + workerInfo.lastHeartbeat = System.currentTimeMillis() + case None => + logWarning("Got heartbeat from unregistered worker " + workerId) + } + } + case Terminated(actor) => { // The disconnected actor could've been either a worker or a job; remove whichever of // those we have an entry for in the corresponding actor hashmap @@ -219,8 +231,9 @@ private[spark] class Master(ip: String, port: Int, 
webUiPort: Int) extends Actor actorToWorker -= worker.actor addressToWorker -= worker.actor.path.address for (exec <- worker.executors.values) { - exec.job.driver ! ExecutorStateChanged(exec.job.id, exec.id, ExecutorState.LOST, None, None) - exec.job.executors -= exec.id + logInfo("Telling job of lost executor: " + exec.id) + exec.job.driver ! ExecutorUpdated(exec.id, ExecutorState.LOST, Some("worker lost"), None) + exec.job.removeExecutor(exec) } } @@ -259,6 +272,18 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor nextJobNumber += 1 jobId } + + /** Check for, and remove, any timed-out workers */ + def timeOutDeadWorkers() { + // Copy the workers into an array so we don't modify the hashset while iterating through it + val expirationTime = System.currentTimeMillis() - WORKER_TIMEOUT + val toRemove = workers.filter(_.lastHeartbeat < expirationTime).toArray + for (worker <- toRemove) { + logWarning("Removing %s because we got no heartbeat in %d seconds".format( + worker.id, WORKER_TIMEOUT)) + removeWorker(worker) + } + } } private[spark] object Master { diff --git a/core/src/main/scala/spark/deploy/master/WorkerInfo.scala b/core/src/main/scala/spark/deploy/master/WorkerInfo.scala index 5a7f5fef8a..2e467007a0 100644 --- a/core/src/main/scala/spark/deploy/master/WorkerInfo.scala +++ b/core/src/main/scala/spark/deploy/master/WorkerInfo.scala @@ -18,6 +18,8 @@ private[spark] class WorkerInfo( var coresUsed = 0 var memoryUsed = 0 + var lastHeartbeat = System.currentTimeMillis() + def coresFree: Int = cores - coresUsed def memoryFree: Int = memory - memoryUsed diff --git a/core/src/main/scala/spark/deploy/worker/Worker.scala b/core/src/main/scala/spark/deploy/worker/Worker.scala index 62f01776a9..924935a5fd 100644 --- a/core/src/main/scala/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/spark/deploy/worker/Worker.scala @@ -2,6 +2,7 @@ package spark.deploy.worker import scala.collection.mutable.{ArrayBuffer, HashMap} import akka.actor.{ActorRef, Props, Actor, ActorSystem, Terminated} +import akka.util.duration._ import spark.{Logging, Utils} import spark.util.AkkaUtils import spark.deploy._ @@ -26,6 +27,9 @@ private[spark] class Worker( val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For worker and executor IDs + // Send a heartbeat every (heartbeat timeout) / 4 milliseconds + val HEARTBEAT_MILLIS = System.getProperty("spark.worker.timeout", "60").toLong * 1000 / 4 + var master: ActorRef = null var masterWebUiUrl : String = "" val workerId = generateWorkerId() @@ -97,6 +101,9 @@ private[spark] class Worker( case RegisteredWorker(url) => masterWebUiUrl = url logInfo("Successfully registered with master") + context.system.scheduler.schedule(0 millis, HEARTBEAT_MILLIS millis) { + master ! 
Heartbeat(workerId) + } case RegisterWorkerFailed(message) => logError("Worker registration failed: " + message) diff --git a/core/src/main/scala/spark/scheduler/cluster/ExecutorLossReason.scala b/core/src/main/scala/spark/scheduler/cluster/ExecutorLossReason.scala index bba7de6a65..8bf838209f 100644 --- a/core/src/main/scala/spark/scheduler/cluster/ExecutorLossReason.scala +++ b/core/src/main/scala/spark/scheduler/cluster/ExecutorLossReason.scala @@ -12,10 +12,10 @@ class ExecutorLossReason(val message: String) { private[spark] case class ExecutorExited(val exitCode: Int) - extends ExecutorLossReason(ExecutorExitCode.explainExitCode(exitCode)) { + extends ExecutorLossReason(ExecutorExitCode.explainExitCode(exitCode)) { } private[spark] case class SlaveLost(_message: String = "Slave lost") - extends ExecutorLossReason(_message) { + extends ExecutorLossReason(_message) { } diff --git a/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index 59ff8bcb90..3c3e83b138 100644 --- a/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -67,6 +67,7 @@ private[spark] class SparkDeploySchedulerBackend( case None => SlaveLost(message) } logInfo("Executor %s removed: %s".format(executorId, message)) + removeExecutor(executorId, reason.toString) scheduler.executorLost(executorId, reason) } } diff --git a/core/src/main/scala/spark/scheduler/cluster/StandaloneClusterMessage.scala b/core/src/main/scala/spark/scheduler/cluster/StandaloneClusterMessage.scala index da7dcf4b6b..d766067824 100644 --- a/core/src/main/scala/spark/scheduler/cluster/StandaloneClusterMessage.scala +++ b/core/src/main/scala/spark/scheduler/cluster/StandaloneClusterMessage.scala @@ -37,3 +37,6 @@ object StatusUpdate { // Internal messages in driver private[spark] case object ReviveOffers extends StandaloneClusterMessage private[spark] case object StopDriver extends StandaloneClusterMessage + +private[spark] case class RemoveExecutor(executorId: String, reason: String) + extends StandaloneClusterMessage diff --git a/core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index 082022be1c..4213eb8719 100644 --- a/core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -68,6 +68,10 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor sender ! true context.stop(self) + case RemoveExecutor(executorId, reason) => + removeExecutor(executorId, reason) + sender ! 
true + case Terminated(actor) => actorToExecutorId.get(actor).foreach(removeExecutor(_, "Akka actor terminated")) @@ -100,7 +104,7 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor // Remove a disconnected slave from the cluster def removeExecutor(executorId: String, reason: String) { - logInfo("Slave " + executorId + " disconnected, so removing it") + logInfo("Executor " + executorId + " disconnected, so removing it") val numCores = freeCores(executorId) actorToExecutorId -= executorActor(executorId) addressToExecutorId -= executorAddress(executorId) @@ -139,7 +143,7 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor } } catch { case e: Exception => - throw new SparkException("Error stopping standalone scheduler's master actor", e) + throw new SparkException("Error stopping standalone scheduler's driver actor", e) } } @@ -148,6 +152,18 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor } override def defaultParallelism(): Int = math.max(totalCoreCount.get(), 2) + + // Called by backends + def removeExecutor(executorId: String, reason: String) { + try { + val timeout = 5.seconds + val future = driverActor.ask(RemoveExecutor(executorId, reason))(timeout) + Await.result(future, timeout) + } catch { + case e: Exception => + throw new SparkException("Error notifying standalone scheduler's driver actor", e) + } + } } private[spark] object StandaloneSchedulerBackend { -- cgit v1.2.3 From da8afbc77e5796d45686034db5560f18c057d3c9 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Wed, 6 Feb 2013 13:30:35 -0800 Subject: Some bug and formatting fixes to FT Conflicts: core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala --- .../main/scala/spark/deploy/client/Client.scala | 2 +- .../main/scala/spark/deploy/master/Master.scala | 4 ++-- .../cluster/SparkDeploySchedulerBackend.scala | 1 - .../cluster/StandaloneSchedulerBackend.scala | 24 ++++++++++++---------- .../mesos/CoarseMesosSchedulerBackend.scala | 6 +++++- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/spark/deploy/client/Client.scala b/core/src/main/scala/spark/deploy/client/Client.scala index a63eee1233..e01181d1b2 100644 --- a/core/src/main/scala/spark/deploy/client/Client.scala +++ b/core/src/main/scala/spark/deploy/client/Client.scala @@ -107,7 +107,7 @@ private[spark] class Client( def stop() { if (actor != null) { try { - val timeout = 1.seconds + val timeout = 5.seconds val future = actor.ask(StopClient)(timeout) Await.result(future, timeout) } catch { diff --git a/core/src/main/scala/spark/deploy/master/Master.scala b/core/src/main/scala/spark/deploy/master/Master.scala index d985261600..a5de23261c 100644 --- a/core/src/main/scala/spark/deploy/master/Master.scala +++ b/core/src/main/scala/spark/deploy/master/Master.scala @@ -117,8 +117,8 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor case Heartbeat(workerId) => { idToWorker.get(workerId) match { case Some(workerInfo) => - workerInfo.lastHeartbeat = System.currentTimeMillis() - case None => + workerInfo.lastHeartbeat = System.currentTimeMillis() + case None => logWarning("Got heartbeat from unregistered worker " + workerId) } } diff --git a/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index 
3c3e83b138..e77355c6cd 100644 --- a/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -68,6 +68,5 @@ private[spark] class SparkDeploySchedulerBackend( } logInfo("Executor %s removed: %s".format(executorId, message)) removeExecutor(executorId, reason.toString) - scheduler.executorLost(executorId, reason) } } diff --git a/core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index 4213eb8719..d606432572 100644 --- a/core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -104,16 +104,18 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor // Remove a disconnected slave from the cluster def removeExecutor(executorId: String, reason: String) { - logInfo("Executor " + executorId + " disconnected, so removing it") - val numCores = freeCores(executorId) - actorToExecutorId -= executorActor(executorId) - addressToExecutorId -= executorAddress(executorId) - executorActor -= executorId - executorHost -= executorId - freeCores -= executorId - executorHost -= executorId - totalCoreCount.addAndGet(-numCores) - scheduler.executorLost(executorId, SlaveLost(reason)) + if (executorActor.contains(executorId)) { + logInfo("Executor " + executorId + " disconnected, so removing it") + val numCores = freeCores(executorId) + actorToExecutorId -= executorActor(executorId) + addressToExecutorId -= executorAddress(executorId) + executorActor -= executorId + executorHost -= executorId + freeCores -= executorId + executorHost -= executorId + totalCoreCount.addAndGet(-numCores) + scheduler.executorLost(executorId, SlaveLost(reason)) + } } } @@ -153,7 +155,7 @@ class StandaloneSchedulerBackend(scheduler: ClusterScheduler, actorSystem: Actor override def defaultParallelism(): Int = math.max(totalCoreCount.get(), 2) - // Called by backends + // Called by subclasses when notified of a lost worker def removeExecutor(executorId: String, reason: String) { try { val timeout = 5.seconds diff --git a/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala index b481ec0a72..7caf06e917 100644 --- a/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala @@ -239,7 +239,11 @@ private[spark] class CoarseMesosSchedulerBackend( override def slaveLost(d: SchedulerDriver, slaveId: SlaveID) { logInfo("Mesos slave lost: " + slaveId.getValue) synchronized { - slaveIdsWithExecutors -= slaveId.getValue + if (slaveIdsWithExecutors.contains(slaveId.getValue)) { + // Note that the slave ID corresponds to the executor ID on that slave + slaveIdsWithExecutors -= slaveId.getValue + removeExecutor(slaveId.getValue, "Mesos slave lost") + } } } -- cgit v1.2.3 From f0b68c623c116540470e06967c1554855d16a500 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 5 Feb 2013 19:02:46 -0800 Subject: Initial cut at replacing K, V in Java files --- core/src/test/scala/spark/JavaAPISuite.java | 24 ++++++++++ .../spark/streaming/api/java/JavaDStreamLike.scala | 4 +- .../test/java/spark/streaming/JavaAPISuite.java | 56 ++++++++++++++++++++++ 3 files changed, 82 insertions(+), 2 deletions(-) diff --git 
a/core/src/test/scala/spark/JavaAPISuite.java b/core/src/test/scala/spark/JavaAPISuite.java index 934e4c2f67..9ffe7c5f99 100644 --- a/core/src/test/scala/spark/JavaAPISuite.java +++ b/core/src/test/scala/spark/JavaAPISuite.java @@ -696,4 +696,28 @@ public class JavaAPISuite implements Serializable { JavaRDD recovered = sc.checkpointFile(rdd.getCheckpointFile().get()); Assert.assertEquals(Arrays.asList(1, 2, 3, 4, 5), recovered.collect()); } + + @Test + public void mapOnPairRDD() { + JavaRDD rdd1 = sc.parallelize(Arrays.asList(1,2,3,4)); + JavaPairRDD rdd2 = rdd1.map(new PairFunction() { + @Override + public Tuple2 call(Integer i) throws Exception { + return new Tuple2(i, i % 2); + } + }); + JavaPairRDD rdd3 = rdd2.map( + new PairFunction, Integer, Integer>() { + @Override + public Tuple2 call(Tuple2 in) throws Exception { + return new Tuple2(in._2(), in._1()); + } + }); + Assert.assertEquals(Arrays.asList( + new Tuple2(1, 1), + new Tuple2(0, 2), + new Tuple2(1, 3), + new Tuple2(0, 4)), rdd3.collect()); + + } } diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index b93cb7865a..39fe0d0ccc 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -59,8 +59,8 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This]] extends Serializable } /** Return a new DStream by applying a function to all elements of this DStream. */ - def map[K, V](f: PairFunction[T, K, V]): JavaPairDStream[K, V] = { - def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K, V]]] + def map[K2, V2](f: PairFunction[T, K2, V2]): JavaPairDStream[K2, V2] = { + def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K2, V2]]] new JavaPairDStream(dstream.map(f)(cm))(f.keyType(), f.valueType()) } diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 79d6093429..26ac82b71a 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -506,6 +506,62 @@ public class JavaAPISuite implements Serializable { new Tuple2("new york", 3), new Tuple2("new york", 1))); + @Test + public void testPairMap() { // Maps pair -> pair + List>> inputData = stringIntKVStream; + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, "california"), + new Tuple2(3, "california"), + new Tuple2(4, "new york"), + new Tuple2(1, "new york")), + Arrays.asList( + new Tuple2(5, "california"), + new Tuple2(5, "california"), + new Tuple2(3, "new york"), + new Tuple2(1, "new york"))); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + JavaPairDStream reversed = pairStream.map( + new PairFunction, Integer, String>() { + @Override + public Tuple2 call(Tuple2 in) throws Exception { + return new Tuple2(in._2(), in._1()); + } + }); + + JavaTestUtils.attachTestOutputStream(reversed); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + + @Test + public void testPairMap2() { // Maps pair -> single + List>> inputData = stringIntKVStream; + + List> expected = Arrays.asList( + Arrays.asList(1, 3, 4, 1), + Arrays.asList(5, 5, 3, 1)); + + JavaDStream> stream = 
JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + JavaDStream reversed = pairStream.map( + new Function, Integer>() { + @Override + public Integer call(Tuple2 in) throws Exception { + return in._2(); + } + }); + + JavaTestUtils.attachTestOutputStream(reversed); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + @Test public void testPairGroupByKey() { List>> inputData = stringStringKVStream; -- cgit v1.2.3 From 314d87a038d84c4ae9a6471ea19a5431153ea604 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 09:20:37 -0800 Subject: Indentation fix --- .../src/test/java/spark/streaming/JavaAPISuite.java | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 26ac82b71a..4cf9d115ae 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -511,16 +511,16 @@ public class JavaAPISuite implements Serializable { List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( - Arrays.asList( - new Tuple2(1, "california"), - new Tuple2(3, "california"), - new Tuple2(4, "new york"), - new Tuple2(1, "new york")), - Arrays.asList( - new Tuple2(5, "california"), - new Tuple2(5, "california"), - new Tuple2(3, "new york"), - new Tuple2(1, "new york"))); + Arrays.asList( + new Tuple2(1, "california"), + new Tuple2(3, "california"), + new Tuple2(4, "new york"), + new Tuple2(1, "new york")), + Arrays.asList( + new Tuple2(5, "california"), + new Tuple2(5, "california"), + new Tuple2(3, "new york"), + new Tuple2(1, "new york"))); JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); -- cgit v1.2.3 From 20cf77054536acd9c064d6e7ffedce23a87fb6a5 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 09:21:06 -0800 Subject: Fix for flatmap --- .../spark/streaming/api/java/JavaDStreamLike.scala | 4 +-- .../test/java/spark/streaming/JavaAPISuite.java | 42 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index 39fe0d0ccc..9cc263930e 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -78,10 +78,10 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This]] extends Serializable * Return a new DStream by applying a function to all elements of this DStream, * and then flattening the results */ - def flatMap[K, V](f: PairFlatMapFunction[T, K, V]): JavaPairDStream[K, V] = { + def flatMap[K2, V2](f: PairFlatMapFunction[T, K2, V2]): JavaPairDStream[K2, V2] = { import scala.collection.JavaConverters._ def fn = (x: T) => f.apply(x).asScala - def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K, V]]] + def cm = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[Tuple2[K2, V2]]] new JavaPairDStream(dstream.flatMap(fn)(cm))(f.keyType(), f.valueType()) } diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 
4cf9d115ae..ec4e5ae18b 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -562,6 +562,48 @@ public class JavaAPISuite implements Serializable { Assert.assertEquals(expected, result); } + @Test + public void testPairToPairFlatMapWithChangingTypes() { // Maps pair -> pair + List>> inputData = Arrays.asList( + Arrays.asList( + new Tuple2("hi", 1), + new Tuple2("ho", 2)), + Arrays.asList( + new Tuple2("hi", 1), + new Tuple2("ho", 2))); + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, "h"), + new Tuple2(1, "i"), + new Tuple2(2, "h"), + new Tuple2(2, "o")), + Arrays.asList( + new Tuple2(1, "h"), + new Tuple2(1, "i"), + new Tuple2(2, "h"), + new Tuple2(2, "o"))); + + JavaDStream> stream = + JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + JavaPairDStream flatMapped = pairStream.flatMap( + new PairFlatMapFunction, Integer, String>() { + @Override + public Iterable> call(Tuple2 in) throws Exception { + List> out = new LinkedList>(); + for (Character s: in._1().toCharArray()) { + out.add(new Tuple2(in._2(), s.toString())); + } + return out; + } + }); + JavaTestUtils.attachTestOutputStream(flatMapped); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + @Test public void testPairGroupByKey() { List>> inputData = stringStringKVStream; -- cgit v1.2.3 From c65988bdc1b75e88e6df77df0b84fc3a34c5b028 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 09:51:57 -0800 Subject: Fix for MapPartitions --- .../spark/streaming/api/java/JavaDStreamLike.scala | 4 +- .../test/java/spark/streaming/JavaAPISuite.java | 67 +++++++++++++++++----- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index 9cc263930e..ec546c8190 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -100,8 +100,8 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This]] extends Serializable * of this DStream. Applying mapPartitions() to an RDD applies a function to each partition * of the RDD. 
*/ - def mapPartitions[K, V](f: PairFlatMapFunction[java.util.Iterator[T], K, V]) - : JavaPairDStream[K, V] = { + def mapPartitions[K2, V2](f: PairFlatMapFunction[java.util.Iterator[T], K2, V2]) + : JavaPairDStream[K2, V2] = { def fn = (x: Iterator[T]) => asScalaIterator(f.apply(asJavaIterator(x)).iterator()) new JavaPairDStream(dstream.mapPartitions(fn))(f.keyType(), f.valueType()) } diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index ec4e5ae18b..67d82d546f 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -507,7 +507,7 @@ public class JavaAPISuite implements Serializable { new Tuple2("new york", 1))); @Test - public void testPairMap() { // Maps pair -> pair + public void testPairMap() { // Maps pair -> pair of different type List>> inputData = stringIntKVStream; List>> expected = Arrays.asList( @@ -538,6 +538,43 @@ public class JavaAPISuite implements Serializable { Assert.assertEquals(expected, result); } + @Test + public void testPairMapPartitions() { // Maps pair -> pair of different type + List>> inputData = stringIntKVStream; + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, "california"), + new Tuple2(3, "california"), + new Tuple2(4, "new york"), + new Tuple2(1, "new york")), + Arrays.asList( + new Tuple2(5, "california"), + new Tuple2(5, "california"), + new Tuple2(3, "new york"), + new Tuple2(1, "new york"))); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + JavaPairDStream reversed = pairStream.mapPartitions( + new PairFlatMapFunction>, Integer, String>() { + @Override + public Iterable> call(Iterator> in) throws Exception { + LinkedList> out = new LinkedList>(); + while (in.hasNext()) { + Tuple2 next = in.next(); + out.add(new Tuple2(next._2(), next._1())); + } + return out; + } + }); + + JavaTestUtils.attachTestOutputStream(reversed); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + @Test public void testPairMap2() { // Maps pair -> single List>> inputData = stringIntKVStream; @@ -588,16 +625,16 @@ public class JavaAPISuite implements Serializable { JavaTestUtils.attachTestInputStream(ssc, inputData, 1); JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream flatMapped = pairStream.flatMap( - new PairFlatMapFunction, Integer, String>() { - @Override - public Iterable> call(Tuple2 in) throws Exception { - List> out = new LinkedList>(); - for (Character s: in._1().toCharArray()) { - out.add(new Tuple2(in._2(), s.toString())); - } - return out; - } - }); + new PairFlatMapFunction, Integer, String>() { + @Override + public Iterable> call(Tuple2 in) throws Exception { + List> out = new LinkedList>(); + for (Character s : in._1().toCharArray()) { + out.add(new Tuple2(in._2(), s.toString())); + } + return out; + } + }); JavaTestUtils.attachTestOutputStream(flatMapped); List>> result = JavaTestUtils.runStreams(ssc, 2, 2); @@ -668,7 +705,7 @@ public class JavaAPISuite implements Serializable { JavaPairDStream combined = pairStream.combineByKey( new Function() { - @Override + @Override public Integer call(Integer i) throws Exception { return i; } @@ -766,19 +803,19 @@ public class JavaAPISuite implements Serializable { JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream 
updated = pairStream.updateStateByKey( - new Function2, Optional, Optional>(){ + new Function2, Optional, Optional>() { @Override public Optional call(List values, Optional state) { int out = 0; if (state.isPresent()) { out = out + state.get(); } - for (Integer v: values) { + for (Integer v : values) { out = out + v; } return Optional.of(out); } - }); + }); JavaTestUtils.attachTestOutputStream(updated); List>> result = JavaTestUtils.runStreams(ssc, 3, 3); -- cgit v1.2.3 From 04786d07391c4052d6dc42ff0828a79a37bbbfdf Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 10:05:49 -0800 Subject: small fix --- streaming/src/test/java/spark/streaming/JavaAPISuite.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 67d82d546f..551d4f15e4 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -810,12 +810,12 @@ public class JavaAPISuite implements Serializable { if (state.isPresent()) { out = out + state.get(); } - for (Integer v : values) { + for (Integer v: values) { out = out + v; } return Optional.of(out); } - }); + }); JavaTestUtils.attachTestOutputStream(updated); List>> result = JavaTestUtils.runStreams(ssc, 3, 3); -- cgit v1.2.3 From d09c36065ca040044530a50f0392c92866b6d301 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 10:45:45 -0800 Subject: Using tuple swap() --- streaming/src/test/java/spark/streaming/JavaAPISuite.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 551d4f15e4..9bfcd83e4d 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -528,7 +528,7 @@ public class JavaAPISuite implements Serializable { new PairFunction, Integer, String>() { @Override public Tuple2 call(Tuple2 in) throws Exception { - return new Tuple2(in._2(), in._1()); + return in.swap(); } }); @@ -563,7 +563,7 @@ public class JavaAPISuite implements Serializable { LinkedList> out = new LinkedList>(); while (in.hasNext()) { Tuple2 next = in.next(); - out.add(new Tuple2(next._2(), next._1())); + out.add(next.swap()); } return out; } -- cgit v1.2.3 From ea08537143d58b79b3ae5d083e9b3a5647257da8 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Mon, 11 Feb 2013 13:23:50 -0800 Subject: Fixed an exponential recursion that could happen with doCheckpoint due to lack of memoization --- bagel/src/test/scala/bagel/BagelSuite.scala | 35 ++++++++++++++++++++++------- core/src/main/scala/spark/RDD.scala | 14 ++++++++---- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/bagel/src/test/scala/bagel/BagelSuite.scala b/bagel/src/test/scala/bagel/BagelSuite.scala index 3c2f9c4616..47829a431e 100644 --- a/bagel/src/test/scala/bagel/BagelSuite.scala +++ b/bagel/src/test/scala/bagel/BagelSuite.scala @@ -1,10 +1,8 @@ package spark.bagel import org.scalatest.{FunSuite, Assertions, BeforeAndAfter} -import org.scalatest.prop.Checkers -import org.scalacheck.Arbitrary._ -import org.scalacheck.Gen -import org.scalacheck.Prop._ +import org.scalatest.concurrent.Timeouts +import org.scalatest.time.SpanSugar._ import scala.collection.mutable.ArrayBuffer @@ -13,7 +11,7 @@ import spark._ class TestVertex(val active: Boolean, val age: Int) 
extends Vertex with Serializable class TestMessage(val targetId: String) extends Message[String] with Serializable -class BagelSuite extends FunSuite with Assertions with BeforeAndAfter { +class BagelSuite extends FunSuite with Assertions with BeforeAndAfter with Timeouts { var sc: SparkContext = _ @@ -25,7 +23,7 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter { // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown System.clearProperty("spark.driver.port") } - + test("halting by voting") { sc = new SparkContext("local", "test") val verts = sc.parallelize(Array("a", "b", "c", "d").map(id => (id, new TestVertex(true, 0)))) @@ -36,8 +34,9 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter { (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) } - for ((id, vert) <- result.collect) + for ((id, vert) <- result.collect) { assert(vert.age === numSupersteps) + } } test("halting by message silence") { @@ -57,7 +56,27 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter { } (new TestVertex(self.active, self.age + 1), msgsOut) } - for ((id, vert) <- result.collect) + for ((id, vert) <- result.collect) { assert(vert.age === numSupersteps) + } + } + + test("large number of iterations") { + // This tests whether jobs with a large number of iterations finish in a reasonable time, + // because non-memoized recursion in RDD or DAGScheduler used to cause them to hang + failAfter(10 seconds) { + sc = new SparkContext("local", "test") + val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0)))) + val msgs = sc.parallelize(Array[(String, TestMessage)]()) + val numSupersteps = 50 + val result = + Bagel.run(sc, verts, msgs, sc.defaultParallelism) { + (self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) => + (new TestVertex(superstep < numSupersteps - 1, self.age + 1), Array[TestMessage]()) + } + for ((id, vert) <- result.collect) { + assert(vert.age === numSupersteps) + } + } } } diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala index 6abb5c4792..f6e927a989 100644 --- a/core/src/main/scala/spark/RDD.scala +++ b/core/src/main/scala/spark/RDD.scala @@ -636,16 +636,22 @@ abstract class RDD[T: ClassManifest]( /** The [[spark.SparkContext]] that this RDD was created on. */ def context = sc + // Avoid handling doCheckpoint multiple times to prevent excessive recursion + private var doCheckpointCalled = false + /** * Performs the checkpointing of this RDD by saving this. It is called by the DAGScheduler * after a job using this RDD has completed (therefore the RDD has been materialized and * potentially stored in memory). doCheckpoint() is called recursively on the parent RDDs. 
*/ private[spark] def doCheckpoint() { - if (checkpointData.isDefined) { - checkpointData.get.doCheckpoint() - } else { - dependencies.foreach(_.rdd.doCheckpoint()) + if (!doCheckpointCalled) { + doCheckpointCalled = true + if (checkpointData.isDefined) { + checkpointData.get.doCheckpoint() + } else { + dependencies.foreach(_.rdd.doCheckpoint()) + } } } -- cgit v1.2.3 From 582d31dff99c161a51e15497db983a4b5a6d4cdb Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Mon, 11 Feb 2013 13:24:54 -0800 Subject: Formatting fixes --- bagel/src/main/scala/spark/bagel/Bagel.scala | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/bagel/src/main/scala/spark/bagel/Bagel.scala b/bagel/src/main/scala/spark/bagel/Bagel.scala index 996ca2a877..fa0ba4a573 100644 --- a/bagel/src/main/scala/spark/bagel/Bagel.scala +++ b/bagel/src/main/scala/spark/bagel/Bagel.scala @@ -6,8 +6,8 @@ import spark.SparkContext._ import scala.collection.mutable.ArrayBuffer object Bagel extends Logging { - def run[K : Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, - C : Manifest, A : Manifest]( + def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, + C: Manifest, A: Manifest]( sc: SparkContext, vertices: RDD[(K, V)], messages: RDD[(K, M)], @@ -50,8 +50,7 @@ object Bagel extends Logging { verts } - def run[K : Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, - C : Manifest]( + def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( sc: SparkContext, vertices: RDD[(K, V)], messages: RDD[(K, M)], @@ -66,8 +65,7 @@ object Bagel extends Logging { addAggregatorArg[K, V, M, C](compute)) } - def run[K : Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, - C : Manifest]( + def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest]( sc: SparkContext, vertices: RDD[(K, V)], messages: RDD[(K, M)], @@ -82,7 +80,7 @@ object Bagel extends Logging { addAggregatorArg[K, V, M, C](compute)) } - def run[K : Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest]( + def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest]( sc: SparkContext, vertices: RDD[(K, V)], messages: RDD[(K, M)], @@ -100,7 +98,7 @@ object Bagel extends Logging { * Aggregates the given vertices using the given aggregator, if it * is specified. */ - private def agg[K, V <: Vertex, A : Manifest]( + private def agg[K, V <: Vertex, A: Manifest]( verts: RDD[(K, V)], aggregator: Option[Aggregator[V, A]] ): Option[A] = aggregator match { @@ -116,7 +114,7 @@ object Bagel extends Logging { * function. Returns the processed RDD, the number of messages * created, and the number of active vertices. */ - private def comp[K : Manifest, V <: Vertex, M <: Message[K], C]( + private def comp[K: Manifest, V <: Vertex, M <: Message[K], C]( sc: SparkContext, grouped: RDD[(K, (Seq[C], Seq[V]))], compute: (V, Option[C]) => (V, Array[M]) @@ -149,9 +147,7 @@ object Bagel extends Logging { * Converts a compute function that doesn't take an aggregator to * one that does, so it can be passed to Bagel.run. 
*/ - private def addAggregatorArg[ - K : Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C - ]( + private def addAggregatorArg[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C]( compute: (V, Option[C], Int) => (V, Array[M]) ): (V, Option[C], Option[Nothing], Int) => (V, Array[M]) = { (vert: V, msgs: Option[C], aggregated: Option[Nothing], superstep: Int) => @@ -170,7 +166,7 @@ trait Aggregator[V, A] { def mergeAggregators(a: A, b: A): A } -class DefaultCombiner[M : Manifest] extends Combiner[M, Array[M]] with Serializable { +class DefaultCombiner[M: Manifest] extends Combiner[M, Array[M]] with Serializable { def createCombiner(msg: M): Array[M] = Array(msg) def mergeMsg(combiner: Array[M], msg: M): Array[M] = -- cgit v1.2.3 From 21df6ffc13c7ad5cc3158675560c8364735d376e Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Mon, 11 Feb 2013 17:43:18 -0800 Subject: SPARK-696: sortByKey should use 'ascending' parameter --- core/src/main/scala/spark/api/java/JavaPairRDD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/spark/api/java/JavaPairRDD.scala index 8a123bdb47..55dc755358 100644 --- a/core/src/main/scala/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/spark/api/java/JavaPairRDD.scala @@ -452,7 +452,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif */ def sortByKey(ascending: Boolean): JavaPairRDD[K, V] = { val comp = com.google.common.collect.Ordering.natural().asInstanceOf[Comparator[K]] - sortByKey(comp, true) + sortByKey(comp, ascending) } /** -- cgit v1.2.3 From 3f3e77f28b08fc1db110c3b14b2c90eaa6dca8ef Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 12 Feb 2013 13:57:57 -0800 Subject: STREAMING-50: Support transform workaround in JavaPairDStream This ports a useful workaround (the `transform` function) to JavaPairDStream. It is necessary to do things like sorting which are not supported yet in the core streaming API. --- .../spark/streaming/api/java/JavaPairDStream.scala | 34 +++++++++++++++- .../test/java/spark/streaming/JavaAPISuite.java | 45 ++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala index ef10c091ca..eb2495e3ac 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala @@ -8,11 +8,11 @@ import scala.collection.JavaConversions._ import spark.streaming._ import spark.streaming.StreamingContext._ import spark.api.java.function.{Function => JFunction, Function2 => JFunction2} -import spark.Partitioner +import spark.{RDD, Partitioner} import org.apache.hadoop.mapred.{JobConf, OutputFormat} import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat} import org.apache.hadoop.conf.Configuration -import spark.api.java.JavaPairRDD +import spark.api.java.{JavaRDD, JavaPairRDD} import spark.storage.StorageLevel import com.google.common.base.Optional @@ -81,6 +81,36 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( def union(that: JavaPairDStream[K, V]): JavaPairDStream[K, V] = dstream.union(that.dstream) + /** + * Return a new DStream in which each RDD is generated by applying a function + * on each RDD of this DStream. 
+ */ + def transform[K2, V2](transformFunc: JFunction[JavaPairRDD[K, V], JavaPairRDD[K2, V2]]): + JavaPairDStream[K2, V2] = { + implicit val cmk: ClassManifest[K2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K2]] + implicit val cmv: ClassManifest[V2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[V2]] + def scalaTransform (in: RDD[(K, V)]): RDD[(K2, V2)] = + transformFunc.call(new JavaPairRDD[K, V](in)).rdd + dstream.transform(scalaTransform(_)) + } + + /** + * Return a new DStream in which each RDD is generated by applying a function + * on each RDD of this DStream. + */ + def transform[K2, V2](transformFunc: JFunction2[JavaPairRDD[K, V], Time, JavaPairRDD[K2, V2]]): + JavaPairDStream[K2, V2] = { + implicit val cmk: ClassManifest[K2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K2]] + implicit val cmv: ClassManifest[V2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[V2]] + def scalaTransform (in: RDD[(K, V)], time: Time): RDD[(K2, V2)] = + transformFunc.call(new JavaPairRDD[K, V](in), time).rdd + dstream.transform(scalaTransform(_, _)) + } + // ======================================================================= // Methods only for PairDStream's // ======================================================================= diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 9bfcd83e4d..7b385f609d 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -11,6 +11,7 @@ import org.junit.Before; import org.junit.Test; import scala.Tuple2; import spark.HashPartitioner; +import spark.api.java.JavaPairRDD; import spark.api.java.JavaRDD; import spark.api.java.JavaSparkContext; import spark.api.java.function.*; @@ -872,6 +873,50 @@ public class JavaAPISuite implements Serializable { Assert.assertEquals(expected, result); } + @Test + public void testPairTransform() { + List>> inputData = Arrays.asList( + Arrays.asList( + new Tuple2(3, 5), + new Tuple2(1, 5), + new Tuple2(4, 5), + new Tuple2(2, 5)), + Arrays.asList( + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5), + new Tuple2(1, 5))); + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, 5), + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5)), + Arrays.asList( + new Tuple2(1, 5), + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5))); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream( + ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + + JavaPairDStream sorted = pairStream.transform( + new Function, JavaPairRDD>() { + @Override + public JavaPairRDD call(JavaPairRDD in) throws Exception { + return in.sortByKey(); + } + }); + + JavaTestUtils.attachTestOutputStream(sorted); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + @Test public void testMapValues() { List>> inputData = stringStringKVStream; -- cgit v1.2.3 From 8f18e7e863728734f927edbcb928a37cdccc4d63 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 13 Feb 2013 13:05:13 -0800 Subject: include jobid in Executor commandline args --- core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala | 2 +- core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala 
b/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala index 4ef637090c..69f34e604a 100644 --- a/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala @@ -77,7 +77,7 @@ private[spark] class ExecutorRunner( val command = jobDesc.command val script = if (System.getProperty("os.name").startsWith("Windows")) "run.cmd" else "run"; val runScript = new File(sparkHome, script).getCanonicalPath - Seq(runScript, command.mainClass) ++ command.arguments.map(substituteVariables) + Seq(runScript, command.mainClass) ++ (command.arguments ++ Seq(jobId)).map(substituteVariables) } /** Spawn a thread that will redirect a given stream to a file */ diff --git a/core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala b/core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala index 224c126fdd..06229f39d9 100644 --- a/core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala +++ b/core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala @@ -68,8 +68,9 @@ private[spark] object StandaloneExecutorBackend { } def main(args: Array[String]) { - if (args.length != 4) { - System.err.println("Usage: StandaloneExecutorBackend ") + if (!(args.length >= 4)) { + //the reason we allow the last frameworkId argument is to make it easy to kill rogue executors + System.err.println("Usage: StandaloneExecutorBackend []") System.exit(1) } run(args(0), args(1), args(2), args(3).toInt) -- cgit v1.2.3 From 893bad90899c5c7edddc5e824f41a975c33571bf Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Wed, 13 Feb 2013 20:30:21 -0800 Subject: use appid instead of frameworkid; simplify stupid condition --- core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala b/core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala index 06229f39d9..9a82c3054c 100644 --- a/core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala +++ b/core/src/main/scala/spark/executor/StandaloneExecutorBackend.scala @@ -68,9 +68,9 @@ private[spark] object StandaloneExecutorBackend { } def main(args: Array[String]) { - if (!(args.length >= 4)) { + if (args.length < 4) { //the reason we allow the last frameworkId argument is to make it easy to kill rogue executors - System.err.println("Usage: StandaloneExecutorBackend []") + System.err.println("Usage: StandaloneExecutorBackend []") System.exit(1) } run(args(0), args(1), args(2), args(3).toInt) -- cgit v1.2.3 From c34b8ad2c59697b3e1f5034074e5de0d3b32b8f9 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 16 Feb 2013 00:54:03 -0600 Subject: Avoid a shuffle if combineByKey is passed the same partitioner. 
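For context, the case this commit optimises can be exercised from user code. The following is a minimal, hypothetical sketch (object name and local master chosen only for illustration): a pair RDD that has already been partitioned is reduced with the same partitioner, so values can be combined within each partition and no second shuffle is needed.

    import spark.{HashPartitioner, SparkContext}
    import spark.SparkContext._

    object SamePartitionerSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "same-partitioner-sketch")
        val part = new HashPartitioner(2)
        // First shuffle: partitionBy distributes the pairs according to `part`.
        val pairs = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 3))).partitionBy(part)
        // Same partitioner again: values are combined within each partition,
        // so no second shuffle is introduced.
        val sums = pairs.reduceByKey(part, _ + _)
        assert(sums.partitioner == Some(part))
        assert(sums.collect().toSet == Set(("a", 3), ("b", 3)))
        sc.stop()
      }
    }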
--- core/src/main/scala/spark/PairRDDFunctions.scala | 4 +++- core/src/test/scala/spark/ShuffleSuite.scala | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/spark/PairRDDFunctions.scala b/core/src/main/scala/spark/PairRDDFunctions.scala index cc3cca2571..4c41519330 100644 --- a/core/src/main/scala/spark/PairRDDFunctions.scala +++ b/core/src/main/scala/spark/PairRDDFunctions.scala @@ -62,7 +62,9 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( } val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners) - if (mapSideCombine) { + if (Option(partitioner) == self.partitioner) { + self.mapPartitions(aggregator.combineValuesByKey(_), true) + } else if (mapSideCombine) { val mapSideCombined = self.mapPartitions(aggregator.combineValuesByKey(_), true) val partitioned = new ShuffledRDD[K, C](mapSideCombined, partitioner) partitioned.mapPartitions(aggregator.combineCombinersByKey(_), true) diff --git a/core/src/test/scala/spark/ShuffleSuite.scala b/core/src/test/scala/spark/ShuffleSuite.scala index 3493b9511f..d6efa3db43 100644 --- a/core/src/test/scala/spark/ShuffleSuite.scala +++ b/core/src/test/scala/spark/ShuffleSuite.scala @@ -98,6 +98,19 @@ class ShuffleSuite extends FunSuite with ShouldMatchers with LocalSparkContext { val sums = pairs.reduceByKey(_+_, 10).collect() assert(sums.toSet === Set((1, 7), (2, 1))) } + + test("reduceByKey with partitioner") { + sc = new SparkContext("local", "test") + val p = new Partitioner() { + def numPartitions = 2 + def getPartition(key: Any) = key.asInstanceOf[Int] + } + val pairs = rddToPairRDDFunctions(sc.parallelize(Array((1, 1), (1, 2), (1, 1), (0, 1)))).partitionBy(p) + val sums = pairs.reduceByKey(p, _+_) + println(sums.toDebugString) + assert(sums.collect().toSet === Set((1, 4), (0, 1))) + assert(sums.partitioner === Some(p)) + } test("join") { sc = new SparkContext("local", "test") -- cgit v1.2.3 From 43288732942a29e7c7c42de66eec6246ea27a13b Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 16 Feb 2013 01:16:40 -0600 Subject: Add assertion about dependencies. 
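The assertion added below walks the RDD lineage by hand. As a standalone illustration, here is a hedged sketch of the same idea (hypothetical object name; only classes that already appear in this patch series are used): traverse `dependencies` recursively, remembering visited RDDs, and count how many ShuffledRDDs the lineage contains.

    import scala.collection.mutable.HashSet
    import spark.RDD
    import spark.rdd.ShuffledRDD

    object LineageSketch {
      // Every RDD reachable through the dependency graph of `root` (excluding root itself).
      def ancestors(root: RDD[_]): Set[RDD[_]] = {
        val seen = new HashSet[RDD[_]]
        def visit(r: RDD[_]) {
          for (dep <- r.dependencies if !seen(dep.rdd)) {
            seen += dep.rdd
            visit(dep.rdd)
          }
        }
        visit(root)
        seen.toSet
      }

      // A lineage containing exactly one ShuffledRDD means only one shuffle was introduced.
      def countShuffles(root: RDD[_]): Int =
        ancestors(root).count(_.isInstanceOf[ShuffledRDD[_, _]])
    }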
--- core/src/main/scala/spark/PairRDDFunctions.scala | 2 +- core/src/test/scala/spark/ShuffleSuite.scala | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/spark/PairRDDFunctions.scala b/core/src/main/scala/spark/PairRDDFunctions.scala index 4c41519330..112beb2320 100644 --- a/core/src/main/scala/spark/PairRDDFunctions.scala +++ b/core/src/main/scala/spark/PairRDDFunctions.scala @@ -62,7 +62,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( } val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners) - if (Option(partitioner) == self.partitioner) { + if (self.partitioner == Some(partitioner)) { self.mapPartitions(aggregator.combineValuesByKey(_), true) } else if (mapSideCombine) { val mapSideCombined = self.mapPartitions(aggregator.combineValuesByKey(_), true) diff --git a/core/src/test/scala/spark/ShuffleSuite.scala b/core/src/test/scala/spark/ShuffleSuite.scala index d6efa3db43..50f2b294bf 100644 --- a/core/src/test/scala/spark/ShuffleSuite.scala +++ b/core/src/test/scala/spark/ShuffleSuite.scala @@ -1,6 +1,7 @@ package spark import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.HashSet import org.scalatest.FunSuite import org.scalatest.matchers.ShouldMatchers @@ -105,11 +106,20 @@ class ShuffleSuite extends FunSuite with ShouldMatchers with LocalSparkContext { def numPartitions = 2 def getPartition(key: Any) = key.asInstanceOf[Int] } - val pairs = rddToPairRDDFunctions(sc.parallelize(Array((1, 1), (1, 2), (1, 1), (0, 1)))).partitionBy(p) - val sums = pairs.reduceByKey(p, _+_) - println(sums.toDebugString) + val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 1), (0, 1))).partitionBy(p) + val sums = pairs.reduceByKey(_+_) assert(sums.collect().toSet === Set((1, 4), (0, 1))) assert(sums.partitioner === Some(p)) + // count the dependencies to make sure there is only 1 ShuffledRDD + val deps = new HashSet[RDD[_]]() + def visit(r: RDD[_]) { + for (dep <- r.dependencies) { + deps += dep.rdd + visit(dep.rdd) + } + } + visit(sums) + assert(deps.size === 2) // ShuffledRDD, ParallelCollection } test("join") { -- cgit v1.2.3 From ae2234687d9040b42619c374eadfd40c896d386d Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 16 Feb 2013 13:10:31 -0600 Subject: Make CoGroupedRDDs explicitly have the same key type. 
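The constructor being tightened here sits behind `PairRDDFunctions.cogroup`, which already required both inputs to share the key type. A minimal, hypothetical sketch (illustrative names, local master) of that public entry point:

    import spark.SparkContext
    import spark.SparkContext._

    object CoGroupSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "cogroup-sketch")
        // Both RDDs are keyed by String, matching the single key type K of CoGroupedRDD[K].
        val ages   = sc.parallelize(Seq(("alice", 30), ("bob", 25)))
        val cities = sc.parallelize(Seq(("alice", "SF"), ("carol", "NYC")))
        // RDD[(String, (Seq[Int], Seq[String]))]: one Seq per input RDD for each key.
        val grouped = ages.cogroup(cities)
        grouped.collect().foreach { case (name, (a, c)) =>
          println(name + ": ages=" + a.mkString(",") + " cities=" + c.mkString(","))
        }
        sc.stop()
      }
    }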
--- core/src/main/scala/spark/PairRDDFunctions.scala | 8 ++++---- core/src/main/scala/spark/rdd/CoGroupedRDD.scala | 4 ++-- core/src/test/scala/spark/CheckpointSuite.scala | 2 +- .../src/main/scala/spark/streaming/PairDStreamFunctions.scala | 2 +- .../src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala | 2 +- .../scala/spark/streaming/dstream/ReducedWindowedDStream.scala | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/spark/PairRDDFunctions.scala b/core/src/main/scala/spark/PairRDDFunctions.scala index cc3cca2571..36b9880cd1 100644 --- a/core/src/main/scala/spark/PairRDDFunctions.scala +++ b/core/src/main/scala/spark/PairRDDFunctions.scala @@ -361,7 +361,7 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( throw new SparkException("Default partitioner cannot partition array keys.") } val cg = new CoGroupedRDD[K]( - Seq(self.asInstanceOf[RDD[(_, _)]], other.asInstanceOf[RDD[(_, _)]]), + Seq(self.asInstanceOf[RDD[(K, _)]], other.asInstanceOf[RDD[(K, _)]]), partitioner) val prfs = new PairRDDFunctions[K, Seq[Seq[_]]](cg)(classManifest[K], Manifests.seqSeqManifest) prfs.mapValues { @@ -380,9 +380,9 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( throw new SparkException("Default partitioner cannot partition array keys.") } val cg = new CoGroupedRDD[K]( - Seq(self.asInstanceOf[RDD[(_, _)]], - other1.asInstanceOf[RDD[(_, _)]], - other2.asInstanceOf[RDD[(_, _)]]), + Seq(self.asInstanceOf[RDD[(K, _)]], + other1.asInstanceOf[RDD[(K, _)]], + other2.asInstanceOf[RDD[(K, _)]]), partitioner) val prfs = new PairRDDFunctions[K, Seq[Seq[_]]](cg)(classManifest[K], Manifests.seqSeqManifest) prfs.mapValues { diff --git a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala index 0a1e2cbee0..868ee5a39f 100644 --- a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala @@ -40,8 +40,8 @@ private[spark] class CoGroupAggregator { (b1, b2) => b1 ++ b2 }) with Serializable -class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(_, _)]], part: Partitioner) - extends RDD[(K, Seq[Seq[_]])](rdds.head.context, Nil) with Logging { +class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(K, _)]], part: Partitioner) + extends RDD[(K, Seq[Seq[_]])](rdds.head.context, Nil) { private val aggr = new CoGroupAggregator diff --git a/core/src/test/scala/spark/CheckpointSuite.scala b/core/src/test/scala/spark/CheckpointSuite.scala index 0d08fd2396..51ff966ae4 100644 --- a/core/src/test/scala/spark/CheckpointSuite.scala +++ b/core/src/test/scala/spark/CheckpointSuite.scala @@ -347,7 +347,7 @@ object CheckpointSuite { def cogroup[K, V](first: RDD[(K, V)], second: RDD[(K, V)], part: Partitioner) = { //println("First = " + first + ", second = " + second) new CoGroupedRDD[K]( - Seq(first.asInstanceOf[RDD[(_, _)]], second.asInstanceOf[RDD[(_, _)]]), + Seq(first.asInstanceOf[RDD[(K, _)]], second.asInstanceOf[RDD[(K, _)]]), part ).asInstanceOf[RDD[(K, Seq[Seq[V]])]] } diff --git a/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala b/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala index fbcf061126..5db3844f1d 100644 --- a/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala +++ b/streaming/src/main/scala/spark/streaming/PairDStreamFunctions.scala @@ -457,7 +457,7 @@ extends Serializable { ): DStream[(K, (Seq[V], Seq[W]))] = { val cgd = new CoGroupedDStream[K]( - Seq(self.asInstanceOf[DStream[(_, _)]], 
other.asInstanceOf[DStream[(_, _)]]), + Seq(self.asInstanceOf[DStream[(K, _)]], other.asInstanceOf[DStream[(K, _)]]), partitioner ) val pdfs = new PairDStreamFunctions[K, Seq[Seq[_]]](cgd)( diff --git a/streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala b/streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala index ddb1bf6b28..4ef4bb7de1 100644 --- a/streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala +++ b/streaming/src/main/scala/spark/streaming/dstream/CoGroupedDStream.scala @@ -6,7 +6,7 @@ import spark.streaming.{Time, DStream, Duration} private[streaming] class CoGroupedDStream[K : ClassManifest]( - parents: Seq[DStream[(_, _)]], + parents: Seq[DStream[(K, _)]], partitioner: Partitioner ) extends DStream[(K, Seq[Seq[_]])](parents.head.ssc) { diff --git a/streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala b/streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala index 733d5c4a25..263655039c 100644 --- a/streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala +++ b/streaming/src/main/scala/spark/streaming/dstream/ReducedWindowedDStream.scala @@ -101,7 +101,7 @@ class ReducedWindowedDStream[K: ClassManifest, V: ClassManifest]( val allRDDs = new ArrayBuffer[RDD[(K, V)]]() += previousWindowRDD ++= oldRDDs ++= newRDDs // Cogroup the reduced RDDs and merge the reduced values - val cogroupedRDD = new CoGroupedRDD[K](allRDDs.toSeq.asInstanceOf[Seq[RDD[(_, _)]]], partitioner) + val cogroupedRDD = new CoGroupedRDD[K](allRDDs.toSeq.asInstanceOf[Seq[RDD[(K, _)]]], partitioner) //val mergeValuesFunc = mergeValues(oldRDDs.size, newRDDs.size) _ val numOldValues = oldRDDs.size -- cgit v1.2.3 From e7713adb99f6b377c2c2b79dba08d2ccf5fa8909 Mon Sep 17 00:00:00 2001 From: Stephen Haberman Date: Sat, 16 Feb 2013 13:20:48 -0600 Subject: Move ParallelCollection into spark.rdd package. 
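The object being moved is what backs `SparkContext.parallelize`. A brief, hypothetical sketch (illustrative name, local master) of the range-slicing behaviour its `slice` helper implements: a Range is cut into sub-ranges rather than materialised, so each partition of a parallelized range stays small.

    import spark.SparkContext

    object ParallelizeRangeSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "parallelize-range-sketch")
        // 4 slices; each is backed by a sub-range of 25 consecutive integers.
        val rdd = sc.parallelize(1 to 100, 4)
        val sizes = rdd.glom().map(_.size).collect()
        assert(sizes.toSeq == Seq(25, 25, 25, 25))
        assert(rdd.reduce(_ + _) == 5050)
        sc.stop()
      }
    }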
--- core/src/main/scala/spark/ParallelCollection.scala | 102 ----------- core/src/main/scala/spark/SparkContext.scala | 6 +- .../scala/spark/rdd/ParallelCollectionRDD.scala | 97 ++++++++++ .../scala/spark/ParallelCollectionSplitSuite.scala | 195 --------------------- .../spark/rdd/ParallelCollectionSplitSuite.scala | 195 +++++++++++++++++++++ 5 files changed, 295 insertions(+), 300 deletions(-) delete mode 100644 core/src/main/scala/spark/ParallelCollection.scala create mode 100644 core/src/main/scala/spark/rdd/ParallelCollectionRDD.scala delete mode 100644 core/src/test/scala/spark/ParallelCollectionSplitSuite.scala create mode 100644 core/src/test/scala/spark/rdd/ParallelCollectionSplitSuite.scala diff --git a/core/src/main/scala/spark/ParallelCollection.scala b/core/src/main/scala/spark/ParallelCollection.scala deleted file mode 100644 index 10adcd53ec..0000000000 --- a/core/src/main/scala/spark/ParallelCollection.scala +++ /dev/null @@ -1,102 +0,0 @@ -package spark - -import scala.collection.immutable.NumericRange -import scala.collection.mutable.ArrayBuffer -import scala.collection.Map - -private[spark] class ParallelCollectionSplit[T: ClassManifest]( - val rddId: Long, - val slice: Int, - values: Seq[T]) - extends Split with Serializable { - - def iterator: Iterator[T] = values.iterator - - override def hashCode(): Int = (41 * (41 + rddId) + slice).toInt - - override def equals(other: Any): Boolean = other match { - case that: ParallelCollectionSplit[_] => (this.rddId == that.rddId && this.slice == that.slice) - case _ => false - } - - override val index: Int = slice -} - -private[spark] class ParallelCollection[T: ClassManifest]( - @transient sc: SparkContext, - @transient data: Seq[T], - numSlices: Int, - locationPrefs: Map[Int,Seq[String]]) - extends RDD[T](sc, Nil) { - // TODO: Right now, each split sends along its full data, even if later down the RDD chain it gets - // cached. It might be worthwhile to write the data to a file in the DFS and read it in the split - // instead. - // UPDATE: A parallel collection can be checkpointed to HDFS, which achieves this goal. - - @transient var splits_ : Array[Split] = { - val slices = ParallelCollection.slice(data, numSlices).toArray - slices.indices.map(i => new ParallelCollectionSplit(id, i, slices(i))).toArray - } - - override def getSplits = splits_ - - override def compute(s: Split, context: TaskContext) = - s.asInstanceOf[ParallelCollectionSplit[T]].iterator - - override def getPreferredLocations(s: Split): Seq[String] = { - locationPrefs.getOrElse(s.index, Nil) - } - - override def clearDependencies() { - splits_ = null - } -} - -private object ParallelCollection { - /** - * Slice a collection into numSlices sub-collections. One extra thing we do here is to treat Range - * collections specially, encoding the slices as other Ranges to minimize memory cost. This makes - * it efficient to run Spark over RDDs representing large sets of numbers. 
- */ - def slice[T: ClassManifest](seq: Seq[T], numSlices: Int): Seq[Seq[T]] = { - if (numSlices < 1) { - throw new IllegalArgumentException("Positive number of slices required") - } - seq match { - case r: Range.Inclusive => { - val sign = if (r.step < 0) { - -1 - } else { - 1 - } - slice(new Range( - r.start, r.end + sign, r.step).asInstanceOf[Seq[T]], numSlices) - } - case r: Range => { - (0 until numSlices).map(i => { - val start = ((i * r.length.toLong) / numSlices).toInt - val end = (((i+1) * r.length.toLong) / numSlices).toInt - new Range(r.start + start * r.step, r.start + end * r.step, r.step) - }).asInstanceOf[Seq[Seq[T]]] - } - case nr: NumericRange[_] => { // For ranges of Long, Double, BigInteger, etc - val slices = new ArrayBuffer[Seq[T]](numSlices) - val sliceSize = (nr.size + numSlices - 1) / numSlices // Round up to catch everything - var r = nr - for (i <- 0 until numSlices) { - slices += r.take(sliceSize).asInstanceOf[Seq[T]] - r = r.drop(sliceSize) - } - slices - } - case _ => { - val array = seq.toArray // To prevent O(n^2) operations for List etc - (0 until numSlices).map(i => { - val start = ((i * array.length.toLong) / numSlices).toInt - val end = (((i+1) * array.length.toLong) / numSlices).toInt - array.slice(start, end).toSeq - }) - } - } - } -} diff --git a/core/src/main/scala/spark/SparkContext.scala b/core/src/main/scala/spark/SparkContext.scala index 0efc00d5dd..047b57dc1f 100644 --- a/core/src/main/scala/spark/SparkContext.scala +++ b/core/src/main/scala/spark/SparkContext.scala @@ -39,7 +39,7 @@ import spark.broadcast._ import spark.deploy.LocalSparkCluster import spark.partial.ApproximateEvaluator import spark.partial.PartialResult -import rdd.{CheckpointRDD, HadoopRDD, NewHadoopRDD, UnionRDD} +import rdd.{CheckpointRDD, HadoopRDD, NewHadoopRDD, UnionRDD, ParallelCollectionRDD} import scheduler.{ResultTask, ShuffleMapTask, DAGScheduler, TaskScheduler} import spark.scheduler.local.LocalScheduler import spark.scheduler.cluster.{SparkDeploySchedulerBackend, SchedulerBackend, ClusterScheduler} @@ -216,7 +216,7 @@ class SparkContext( /** Distribute a local Scala collection to form an RDD. */ def parallelize[T: ClassManifest](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = { - new ParallelCollection[T](this, seq, numSlices, Map[Int, Seq[String]]()) + new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]()) } /** Distribute a local Scala collection to form an RDD. */ @@ -229,7 +229,7 @@ class SparkContext( * Create a new partition for each collection item. 
*/ def makeRDD[T: ClassManifest](seq: Seq[(T, Seq[String])]): RDD[T] = { val indexToPrefs = seq.zipWithIndex.map(t => (t._2, t._1._2)).toMap - new ParallelCollection[T](this, seq.map(_._1), seq.size, indexToPrefs) + new ParallelCollectionRDD[T](this, seq.map(_._1), seq.size, indexToPrefs) } /** diff --git a/core/src/main/scala/spark/rdd/ParallelCollectionRDD.scala b/core/src/main/scala/spark/rdd/ParallelCollectionRDD.scala new file mode 100644 index 0000000000..e703794787 --- /dev/null +++ b/core/src/main/scala/spark/rdd/ParallelCollectionRDD.scala @@ -0,0 +1,97 @@ +package spark.rdd + +import scala.collection.immutable.NumericRange +import scala.collection.mutable.ArrayBuffer +import scala.collection.Map +import spark.{RDD, TaskContext, SparkContext, Split} + +private[spark] class ParallelCollectionSplit[T: ClassManifest]( + val rddId: Long, + val slice: Int, + values: Seq[T]) + extends Split with Serializable { + + def iterator: Iterator[T] = values.iterator + + override def hashCode(): Int = (41 * (41 + rddId) + slice).toInt + + override def equals(other: Any): Boolean = other match { + case that: ParallelCollectionSplit[_] => (this.rddId == that.rddId && this.slice == that.slice) + case _ => false + } + + override val index: Int = slice +} + +private[spark] class ParallelCollectionRDD[T: ClassManifest]( + @transient sc: SparkContext, + @transient data: Seq[T], + numSlices: Int, + locationPrefs: Map[Int,Seq[String]]) + extends RDD[T](sc, Nil) { + // TODO: Right now, each split sends along its full data, even if later down the RDD chain it gets + // cached. It might be worthwhile to write the data to a file in the DFS and read it in the split + // instead. + // UPDATE: A parallel collection can be checkpointed to HDFS, which achieves this goal. + + override def getSplits: Array[Split] = { + val slices = ParallelCollectionRDD.slice(data, numSlices).toArray + slices.indices.map(i => new ParallelCollectionSplit(id, i, slices(i))).toArray + } + + override def compute(s: Split, context: TaskContext) = + s.asInstanceOf[ParallelCollectionSplit[T]].iterator + + override def getPreferredLocations(s: Split): Seq[String] = { + locationPrefs.getOrElse(s.index, Nil) + } +} + +private object ParallelCollectionRDD { + /** + * Slice a collection into numSlices sub-collections. One extra thing we do here is to treat Range + * collections specially, encoding the slices as other Ranges to minimize memory cost. This makes + * it efficient to run Spark over RDDs representing large sets of numbers. 
+ */ + def slice[T: ClassManifest](seq: Seq[T], numSlices: Int): Seq[Seq[T]] = { + if (numSlices < 1) { + throw new IllegalArgumentException("Positive number of slices required") + } + seq match { + case r: Range.Inclusive => { + val sign = if (r.step < 0) { + -1 + } else { + 1 + } + slice(new Range( + r.start, r.end + sign, r.step).asInstanceOf[Seq[T]], numSlices) + } + case r: Range => { + (0 until numSlices).map(i => { + val start = ((i * r.length.toLong) / numSlices).toInt + val end = (((i+1) * r.length.toLong) / numSlices).toInt + new Range(r.start + start * r.step, r.start + end * r.step, r.step) + }).asInstanceOf[Seq[Seq[T]]] + } + case nr: NumericRange[_] => { // For ranges of Long, Double, BigInteger, etc + val slices = new ArrayBuffer[Seq[T]](numSlices) + val sliceSize = (nr.size + numSlices - 1) / numSlices // Round up to catch everything + var r = nr + for (i <- 0 until numSlices) { + slices += r.take(sliceSize).asInstanceOf[Seq[T]] + r = r.drop(sliceSize) + } + slices + } + case _ => { + val array = seq.toArray // To prevent O(n^2) operations for List etc + (0 until numSlices).map(i => { + val start = ((i * array.length.toLong) / numSlices).toInt + val end = (((i+1) * array.length.toLong) / numSlices).toInt + array.slice(start, end).toSeq + }) + } + } + } +} diff --git a/core/src/test/scala/spark/ParallelCollectionSplitSuite.scala b/core/src/test/scala/spark/ParallelCollectionSplitSuite.scala deleted file mode 100644 index 450c69bd58..0000000000 --- a/core/src/test/scala/spark/ParallelCollectionSplitSuite.scala +++ /dev/null @@ -1,195 +0,0 @@ -package spark - -import scala.collection.immutable.NumericRange - -import org.scalatest.FunSuite -import org.scalatest.prop.Checkers -import org.scalacheck.Arbitrary._ -import org.scalacheck.Gen -import org.scalacheck.Prop._ - -class ParallelCollectionSplitSuite extends FunSuite with Checkers { - test("one element per slice") { - val data = Array(1, 2, 3) - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices(0).mkString(",") === "1") - assert(slices(1).mkString(",") === "2") - assert(slices(2).mkString(",") === "3") - } - - test("one slice") { - val data = Array(1, 2, 3) - val slices = ParallelCollection.slice(data, 1) - assert(slices.size === 1) - assert(slices(0).mkString(",") === "1,2,3") - } - - test("equal slices") { - val data = Array(1, 2, 3, 4, 5, 6, 7, 8, 9) - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices(0).mkString(",") === "1,2,3") - assert(slices(1).mkString(",") === "4,5,6") - assert(slices(2).mkString(",") === "7,8,9") - } - - test("non-equal slices") { - val data = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices(0).mkString(",") === "1,2,3") - assert(slices(1).mkString(",") === "4,5,6") - assert(slices(2).mkString(",") === "7,8,9,10") - } - - test("splitting exclusive range") { - val data = 0 until 100 - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices(0).mkString(",") === (0 to 32).mkString(",")) - assert(slices(1).mkString(",") === (33 to 65).mkString(",")) - assert(slices(2).mkString(",") === (66 to 99).mkString(",")) - } - - test("splitting inclusive range") { - val data = 0 to 100 - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices(0).mkString(",") === (0 to 32).mkString(",")) - assert(slices(1).mkString(",") === (33 to 66).mkString(",")) - assert(slices(2).mkString(",") 
=== (67 to 100).mkString(",")) - } - - test("empty data") { - val data = new Array[Int](0) - val slices = ParallelCollection.slice(data, 5) - assert(slices.size === 5) - for (slice <- slices) assert(slice.size === 0) - } - - test("zero slices") { - val data = Array(1, 2, 3) - intercept[IllegalArgumentException] { ParallelCollection.slice(data, 0) } - } - - test("negative number of slices") { - val data = Array(1, 2, 3) - intercept[IllegalArgumentException] { ParallelCollection.slice(data, -5) } - } - - test("exclusive ranges sliced into ranges") { - val data = 1 until 100 - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices.map(_.size).reduceLeft(_+_) === 99) - assert(slices.forall(_.isInstanceOf[Range])) - } - - test("inclusive ranges sliced into ranges") { - val data = 1 to 100 - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices.map(_.size).reduceLeft(_+_) === 100) - assert(slices.forall(_.isInstanceOf[Range])) - } - - test("large ranges don't overflow") { - val N = 100 * 1000 * 1000 - val data = 0 until N - val slices = ParallelCollection.slice(data, 40) - assert(slices.size === 40) - for (i <- 0 until 40) { - assert(slices(i).isInstanceOf[Range]) - val range = slices(i).asInstanceOf[Range] - assert(range.start === i * (N / 40), "slice " + i + " start") - assert(range.end === (i+1) * (N / 40), "slice " + i + " end") - assert(range.step === 1, "slice " + i + " step") - } - } - - test("random array tests") { - val gen = for { - d <- arbitrary[List[Int]] - n <- Gen.choose(1, 100) - } yield (d, n) - val prop = forAll(gen) { - (tuple: (List[Int], Int)) => - val d = tuple._1 - val n = tuple._2 - val slices = ParallelCollection.slice(d, n) - ("n slices" |: slices.size == n) && - ("concat to d" |: Seq.concat(slices: _*).mkString(",") == d.mkString(",")) && - ("equal sizes" |: slices.map(_.size).forall(x => x==d.size/n || x==d.size/n+1)) - } - check(prop) - } - - test("random exclusive range tests") { - val gen = for { - a <- Gen.choose(-100, 100) - b <- Gen.choose(-100, 100) - step <- Gen.choose(-5, 5) suchThat (_ != 0) - n <- Gen.choose(1, 100) - } yield (a until b by step, n) - val prop = forAll(gen) { - case (d: Range, n: Int) => - val slices = ParallelCollection.slice(d, n) - ("n slices" |: slices.size == n) && - ("all ranges" |: slices.forall(_.isInstanceOf[Range])) && - ("concat to d" |: Seq.concat(slices: _*).mkString(",") == d.mkString(",")) && - ("equal sizes" |: slices.map(_.size).forall(x => x==d.size/n || x==d.size/n+1)) - } - check(prop) - } - - test("random inclusive range tests") { - val gen = for { - a <- Gen.choose(-100, 100) - b <- Gen.choose(-100, 100) - step <- Gen.choose(-5, 5) suchThat (_ != 0) - n <- Gen.choose(1, 100) - } yield (a to b by step, n) - val prop = forAll(gen) { - case (d: Range, n: Int) => - val slices = ParallelCollection.slice(d, n) - ("n slices" |: slices.size == n) && - ("all ranges" |: slices.forall(_.isInstanceOf[Range])) && - ("concat to d" |: Seq.concat(slices: _*).mkString(",") == d.mkString(",")) && - ("equal sizes" |: slices.map(_.size).forall(x => x==d.size/n || x==d.size/n+1)) - } - check(prop) - } - - test("exclusive ranges of longs") { - val data = 1L until 100L - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices.map(_.size).reduceLeft(_+_) === 99) - assert(slices.forall(_.isInstanceOf[NumericRange[_]])) - } - - test("inclusive ranges of longs") { - val data = 1L to 100L - val slices = ParallelCollection.slice(data, 3) - 
assert(slices.size === 3) - assert(slices.map(_.size).reduceLeft(_+_) === 100) - assert(slices.forall(_.isInstanceOf[NumericRange[_]])) - } - - test("exclusive ranges of doubles") { - val data = 1.0 until 100.0 by 1.0 - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices.map(_.size).reduceLeft(_+_) === 99) - assert(slices.forall(_.isInstanceOf[NumericRange[_]])) - } - - test("inclusive ranges of doubles") { - val data = 1.0 to 100.0 by 1.0 - val slices = ParallelCollection.slice(data, 3) - assert(slices.size === 3) - assert(slices.map(_.size).reduceLeft(_+_) === 100) - assert(slices.forall(_.isInstanceOf[NumericRange[_]])) - } -} diff --git a/core/src/test/scala/spark/rdd/ParallelCollectionSplitSuite.scala b/core/src/test/scala/spark/rdd/ParallelCollectionSplitSuite.scala new file mode 100644 index 0000000000..d27a2538e4 --- /dev/null +++ b/core/src/test/scala/spark/rdd/ParallelCollectionSplitSuite.scala @@ -0,0 +1,195 @@ +package spark.rdd + +import scala.collection.immutable.NumericRange + +import org.scalatest.FunSuite +import org.scalatest.prop.Checkers +import org.scalacheck.Arbitrary._ +import org.scalacheck.Gen +import org.scalacheck.Prop._ + +class ParallelCollectionSplitSuite extends FunSuite with Checkers { + test("one element per slice") { + val data = Array(1, 2, 3) + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices(0).mkString(",") === "1") + assert(slices(1).mkString(",") === "2") + assert(slices(2).mkString(",") === "3") + } + + test("one slice") { + val data = Array(1, 2, 3) + val slices = ParallelCollectionRDD.slice(data, 1) + assert(slices.size === 1) + assert(slices(0).mkString(",") === "1,2,3") + } + + test("equal slices") { + val data = Array(1, 2, 3, 4, 5, 6, 7, 8, 9) + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices(0).mkString(",") === "1,2,3") + assert(slices(1).mkString(",") === "4,5,6") + assert(slices(2).mkString(",") === "7,8,9") + } + + test("non-equal slices") { + val data = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices(0).mkString(",") === "1,2,3") + assert(slices(1).mkString(",") === "4,5,6") + assert(slices(2).mkString(",") === "7,8,9,10") + } + + test("splitting exclusive range") { + val data = 0 until 100 + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices(0).mkString(",") === (0 to 32).mkString(",")) + assert(slices(1).mkString(",") === (33 to 65).mkString(",")) + assert(slices(2).mkString(",") === (66 to 99).mkString(",")) + } + + test("splitting inclusive range") { + val data = 0 to 100 + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices(0).mkString(",") === (0 to 32).mkString(",")) + assert(slices(1).mkString(",") === (33 to 66).mkString(",")) + assert(slices(2).mkString(",") === (67 to 100).mkString(",")) + } + + test("empty data") { + val data = new Array[Int](0) + val slices = ParallelCollectionRDD.slice(data, 5) + assert(slices.size === 5) + for (slice <- slices) assert(slice.size === 0) + } + + test("zero slices") { + val data = Array(1, 2, 3) + intercept[IllegalArgumentException] { ParallelCollectionRDD.slice(data, 0) } + } + + test("negative number of slices") { + val data = Array(1, 2, 3) + intercept[IllegalArgumentException] { ParallelCollectionRDD.slice(data, -5) } + } + + test("exclusive ranges sliced into ranges") { + val data = 1 until 
100 + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices.map(_.size).reduceLeft(_+_) === 99) + assert(slices.forall(_.isInstanceOf[Range])) + } + + test("inclusive ranges sliced into ranges") { + val data = 1 to 100 + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices.map(_.size).reduceLeft(_+_) === 100) + assert(slices.forall(_.isInstanceOf[Range])) + } + + test("large ranges don't overflow") { + val N = 100 * 1000 * 1000 + val data = 0 until N + val slices = ParallelCollectionRDD.slice(data, 40) + assert(slices.size === 40) + for (i <- 0 until 40) { + assert(slices(i).isInstanceOf[Range]) + val range = slices(i).asInstanceOf[Range] + assert(range.start === i * (N / 40), "slice " + i + " start") + assert(range.end === (i+1) * (N / 40), "slice " + i + " end") + assert(range.step === 1, "slice " + i + " step") + } + } + + test("random array tests") { + val gen = for { + d <- arbitrary[List[Int]] + n <- Gen.choose(1, 100) + } yield (d, n) + val prop = forAll(gen) { + (tuple: (List[Int], Int)) => + val d = tuple._1 + val n = tuple._2 + val slices = ParallelCollectionRDD.slice(d, n) + ("n slices" |: slices.size == n) && + ("concat to d" |: Seq.concat(slices: _*).mkString(",") == d.mkString(",")) && + ("equal sizes" |: slices.map(_.size).forall(x => x==d.size/n || x==d.size/n+1)) + } + check(prop) + } + + test("random exclusive range tests") { + val gen = for { + a <- Gen.choose(-100, 100) + b <- Gen.choose(-100, 100) + step <- Gen.choose(-5, 5) suchThat (_ != 0) + n <- Gen.choose(1, 100) + } yield (a until b by step, n) + val prop = forAll(gen) { + case (d: Range, n: Int) => + val slices = ParallelCollectionRDD.slice(d, n) + ("n slices" |: slices.size == n) && + ("all ranges" |: slices.forall(_.isInstanceOf[Range])) && + ("concat to d" |: Seq.concat(slices: _*).mkString(",") == d.mkString(",")) && + ("equal sizes" |: slices.map(_.size).forall(x => x==d.size/n || x==d.size/n+1)) + } + check(prop) + } + + test("random inclusive range tests") { + val gen = for { + a <- Gen.choose(-100, 100) + b <- Gen.choose(-100, 100) + step <- Gen.choose(-5, 5) suchThat (_ != 0) + n <- Gen.choose(1, 100) + } yield (a to b by step, n) + val prop = forAll(gen) { + case (d: Range, n: Int) => + val slices = ParallelCollectionRDD.slice(d, n) + ("n slices" |: slices.size == n) && + ("all ranges" |: slices.forall(_.isInstanceOf[Range])) && + ("concat to d" |: Seq.concat(slices: _*).mkString(",") == d.mkString(",")) && + ("equal sizes" |: slices.map(_.size).forall(x => x==d.size/n || x==d.size/n+1)) + } + check(prop) + } + + test("exclusive ranges of longs") { + val data = 1L until 100L + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices.map(_.size).reduceLeft(_+_) === 99) + assert(slices.forall(_.isInstanceOf[NumericRange[_]])) + } + + test("inclusive ranges of longs") { + val data = 1L to 100L + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices.map(_.size).reduceLeft(_+_) === 100) + assert(slices.forall(_.isInstanceOf[NumericRange[_]])) + } + + test("exclusive ranges of doubles") { + val data = 1.0 until 100.0 by 1.0 + val slices = ParallelCollectionRDD.slice(data, 3) + assert(slices.size === 3) + assert(slices.map(_.size).reduceLeft(_+_) === 99) + assert(slices.forall(_.isInstanceOf[NumericRange[_]])) + } + + test("inclusive ranges of doubles") { + val data = 1.0 to 100.0 by 1.0 + val slices = ParallelCollectionRDD.slice(data, 3) + 
assert(slices.size === 3) + assert(slices.map(_.size).reduceLeft(_+_) === 100) + assert(slices.forall(_.isInstanceOf[NumericRange[_]])) + } +} -- cgit v1.2.3 From 08e444df0e8afec1bcae45ad1edcaff1fc21b39e Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Sun, 17 Feb 2013 14:01:48 -0800 Subject: Change EC2 script to use 0.6 AMIs by default, for now --- ec2/spark_ec2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index ce1072fd39..cb8f78db8d 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -87,8 +87,8 @@ def parse_args(): parser.add_option("-g", "--ganglia", action="store_true", default=True, help="Setup ganglia monitoring for the cluster. NOTE: The ganglia " + "monitoring page will be publicly accessible") - parser.add_option("--mesos-scripts", action="store_true", default=False, - help="Use older mesos-ec2 scripts to setup the cluster. NOTE: Ganglia " + + parser.add_option("--new-scripts", action="store_true", default=False, + help="Use new spark-ec2 scripts to setup the cluster. NOTE: Ganglia " + "will not be setup with this option") parser.add_option("-u", "--user", default="root", help="The ssh user you want to connect as (default: root)") @@ -380,17 +380,17 @@ def setup_cluster(conn, master_nodes, slave_nodes, zoo_nodes, opts, deploy_ssh_k if opts.ganglia: modules.append('ganglia') - if not opts.mesos_scripts: + if opts.new_scripts: # NOTE: We should clone the repository before running deploy_files to # prevent ec2-variables.sh from being overwritten - ssh(master, opts, "rm -rf spark-ec2 && git clone https://github.com/shivaram/spark-ec2.git") + ssh(master, opts, "rm -rf spark-ec2 && git clone https://github.com/mesos/spark-ec2.git") print "Deploying files to master..." deploy_files(conn, "deploy.generic", opts, master_nodes, slave_nodes, zoo_nodes, modules) print "Running setup on master..." - if opts.mesos_scripts: + if not opts.new_scripts: if opts.cluster_type == "mesos": setup_mesos_cluster(master, opts) elif opts.cluster_type == "standalone": -- cgit v1.2.3 From 455d015076ab1fcafa99484c8dcf7cc9d740686a Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Sun, 17 Feb 2013 16:53:12 -0800 Subject: Clean up EC2 script options a bit --- ec2/spark_ec2.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index cb8f78db8d..7967bcac50 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -82,18 +82,21 @@ def parse_args(): parser.add_option("--spot-price", metavar="PRICE", type="float", help="If specified, launch slaves as spot instances with the given " + "maximum price (in dollars)") - parser.add_option("-c", "--cluster-type", default="mesos", - help="'mesos' for a mesos cluster, 'standalone' for a standalone spark cluster (default: mesos)") - parser.add_option("-g", "--ganglia", action="store_true", default=True, - help="Setup ganglia monitoring for the cluster. NOTE: The ganglia " + - "monitoring page will be publicly accessible") + parser.add_option("--cluster-type", type="choice", metavar="TYPE", + choices=["mesos", "standalone"], default="mesos", + help="'mesos' for a Mesos cluster, 'standalone' for a standalone " + + "Spark cluster (default: mesos)") + parser.add_option("--ganglia", action="store_true", default=True, + help="Setup Ganglia monitoring on cluster (default: on). 
NOTE: " + + "the Ganglia page will be publicly accessible") + parser.add_option("--no-ganglia", action="store_false", dest="ganglia", + help="Disable Ganglia monitoring for the cluster") parser.add_option("--new-scripts", action="store_true", default=False, - help="Use new spark-ec2 scripts to setup the cluster. NOTE: Ganglia " + - "will not be setup with this option") + help="Use new spark-ec2 scripts, for Spark >= 0.7 AMIs") parser.add_option("-u", "--user", default="root", - help="The ssh user you want to connect as (default: root)") + help="The SSH user you want to connect as (default: root)") parser.add_option("--delete-groups", action="store_true", default=False, - help="When destroying a cluster, also destroy the security groups that were created") + help="When destroying a cluster, delete the security groups that were created") (opts, args) = parser.parse_args() if len(args) != 2: -- cgit v1.2.3 From 06e5e6627f3856b5c6e3e60cbb167044de9ef6d4 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Sun, 17 Feb 2013 22:13:26 -0800 Subject: Renamed "splits" to "partitions" --- bagel/src/main/scala/spark/bagel/Bagel.scala | 20 ++-- .../spark/bagel/examples/WikipediaPageRank.scala | 6 +- .../examples/WikipediaPageRankStandalone.scala | 2 +- core/src/main/scala/spark/CacheManager.scala | 4 +- core/src/main/scala/spark/DoubleRDDFunctions.scala | 4 +- core/src/main/scala/spark/PairRDDFunctions.scala | 50 +++++----- core/src/main/scala/spark/Partition.scala | 14 +++ core/src/main/scala/spark/RDD.scala | 76 +++++++------- core/src/main/scala/spark/RDDCheckpointData.scala | 12 +-- core/src/main/scala/spark/SparkContext.scala | 10 +- core/src/main/scala/spark/Split.scala | 14 --- .../main/scala/spark/api/java/JavaDoubleRDD.scala | 6 +- .../main/scala/spark/api/java/JavaPairRDD.scala | 44 ++++----- core/src/main/scala/spark/api/java/JavaRDD.scala | 6 +- .../main/scala/spark/api/java/JavaRDDLike.scala | 12 +-- .../main/scala/spark/api/python/PythonRDD.scala | 10 +- .../spark/partial/ApproximateActionListener.scala | 2 +- core/src/main/scala/spark/rdd/BlockRDD.scala | 16 +-- core/src/main/scala/spark/rdd/CartesianRDD.scala | 36 +++---- core/src/main/scala/spark/rdd/CheckpointRDD.scala | 20 ++-- core/src/main/scala/spark/rdd/CoGroupedRDD.scala | 22 ++--- core/src/main/scala/spark/rdd/CoalescedRDD.scala | 24 ++--- core/src/main/scala/spark/rdd/FilteredRDD.scala | 6 +- core/src/main/scala/spark/rdd/FlatMappedRDD.scala | 6 +- core/src/main/scala/spark/rdd/GlommedRDD.scala | 6 +- core/src/main/scala/spark/rdd/HadoopRDD.scala | 20 ++-- .../main/scala/spark/rdd/MapPartitionsRDD.scala | 8 +- .../spark/rdd/MapPartitionsWithIndexRDD.scala | 24 +++++ .../spark/rdd/MapPartitionsWithSplitRDD.scala | 24 ----- core/src/main/scala/spark/rdd/MappedRDD.scala | 6 +- core/src/main/scala/spark/rdd/NewHadoopRDD.scala | 20 ++-- .../scala/spark/rdd/ParallelCollectionRDD.scala | 18 ++-- .../main/scala/spark/rdd/PartitionPruningRDD.scala | 16 +-- core/src/main/scala/spark/rdd/PipedRDD.scala | 6 +- core/src/main/scala/spark/rdd/SampledRDD.scala | 16 +-- core/src/main/scala/spark/rdd/ShuffledRDD.scala | 10 +- core/src/main/scala/spark/rdd/UnionRDD.scala | 30 +++--- core/src/main/scala/spark/rdd/ZippedRDD.scala | 32 +++--- .../main/scala/spark/scheduler/DAGScheduler.scala | 14 +-- .../main/scala/spark/scheduler/ResultTask.scala | 6 +- .../scala/spark/scheduler/ShuffleMapTask.scala | 6 +- core/src/main/scala/spark/scheduler/Stage.scala | 2 +- .../main/scala/spark/storage/BlockManager.scala | 2 +- 
.../main/scala/spark/storage/StorageUtils.scala | 2 +- core/src/test/scala/spark/CheckpointSuite.scala | 110 ++++++++++----------- core/src/test/scala/spark/RDDSuite.scala | 13 ++- core/src/test/scala/spark/ShuffleSuite.scala | 2 +- core/src/test/scala/spark/SortingSuite.scala | 10 +- .../scala/spark/scheduler/DAGSchedulerSuite.scala | 22 ++--- .../scala/spark/scheduler/TaskContextSuite.scala | 10 +- 50 files changed, 436 insertions(+), 421 deletions(-) create mode 100644 core/src/main/scala/spark/Partition.scala delete mode 100644 core/src/main/scala/spark/Split.scala create mode 100644 core/src/main/scala/spark/rdd/MapPartitionsWithIndexRDD.scala delete mode 100644 core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala diff --git a/bagel/src/main/scala/spark/bagel/Bagel.scala b/bagel/src/main/scala/spark/bagel/Bagel.scala index fa0ba4a573..094e57dacb 100644 --- a/bagel/src/main/scala/spark/bagel/Bagel.scala +++ b/bagel/src/main/scala/spark/bagel/Bagel.scala @@ -14,11 +14,11 @@ object Bagel extends Logging { combiner: Combiner[M, C], aggregator: Option[Aggregator[V, A]], partitioner: Partitioner, - numSplits: Int + numPartitions: Int )( compute: (V, Option[C], Option[A], Int) => (V, Array[M]) ): RDD[(K, V)] = { - val splits = if (numSplits != 0) numSplits else sc.defaultParallelism + val splits = if (numPartitions != 0) numPartitions else sc.defaultParallelism var superstep = 0 var verts = vertices @@ -56,12 +56,12 @@ object Bagel extends Logging { messages: RDD[(K, M)], combiner: Combiner[M, C], partitioner: Partitioner, - numSplits: Int + numPartitions: Int )( compute: (V, Option[C], Int) => (V, Array[M]) ): RDD[(K, V)] = { run[K, V, M, C, Nothing]( - sc, vertices, messages, combiner, None, partitioner, numSplits)( + sc, vertices, messages, combiner, None, partitioner, numPartitions)( addAggregatorArg[K, V, M, C](compute)) } @@ -70,13 +70,13 @@ object Bagel extends Logging { vertices: RDD[(K, V)], messages: RDD[(K, M)], combiner: Combiner[M, C], - numSplits: Int + numPartitions: Int )( compute: (V, Option[C], Int) => (V, Array[M]) ): RDD[(K, V)] = { - val part = new HashPartitioner(numSplits) + val part = new HashPartitioner(numPartitions) run[K, V, M, C, Nothing]( - sc, vertices, messages, combiner, None, part, numSplits)( + sc, vertices, messages, combiner, None, part, numPartitions)( addAggregatorArg[K, V, M, C](compute)) } @@ -84,13 +84,13 @@ object Bagel extends Logging { sc: SparkContext, vertices: RDD[(K, V)], messages: RDD[(K, M)], - numSplits: Int + numPartitions: Int )( compute: (V, Option[Array[M]], Int) => (V, Array[M]) ): RDD[(K, V)] = { - val part = new HashPartitioner(numSplits) + val part = new HashPartitioner(numPartitions) run[K, V, M, Array[M], Nothing]( - sc, vertices, messages, new DefaultCombiner(), None, part, numSplits)( + sc, vertices, messages, new DefaultCombiner(), None, part, numPartitions)( addAggregatorArg[K, V, M, Array[M]](compute)) } diff --git a/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRank.scala b/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRank.scala index 03843019c0..bc32663e0f 100644 --- a/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRank.scala +++ b/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRank.scala @@ -16,7 +16,7 @@ import scala.xml.{XML,NodeSeq} object WikipediaPageRank { def main(args: Array[String]) { if (args.length < 5) { - System.err.println("Usage: WikipediaPageRank ") + System.err.println("Usage: WikipediaPageRank ") System.exit(-1) } @@ -25,7 +25,7 @@ object WikipediaPageRank { val 
inputFile = args(0) val threshold = args(1).toDouble - val numSplits = args(2).toInt + val numPartitions = args(2).toInt val host = args(3) val usePartitioner = args(4).toBoolean val sc = new SparkContext(host, "WikipediaPageRank") @@ -69,7 +69,7 @@ object WikipediaPageRank { val result = Bagel.run( sc, vertices, messages, combiner = new PRCombiner(), - numSplits = numSplits)( + numPartitions = numPartitions)( utils.computeWithCombiner(numVertices, epsilon)) // Print the result diff --git a/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala b/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala index 06cc8c748b..9d9d80d809 100644 --- a/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala +++ b/bagel/src/main/scala/spark/bagel/examples/WikipediaPageRankStandalone.scala @@ -88,7 +88,7 @@ object WikipediaPageRankStandalone { n: Long, partitioner: Partitioner, usePartitioner: Boolean, - numSplits: Int + numPartitions: Int ): RDD[(String, Double)] = { var ranks = links.mapValues { edges => defaultRank } for (i <- 1 to numIterations) { diff --git a/core/src/main/scala/spark/CacheManager.scala b/core/src/main/scala/spark/CacheManager.scala index 711435c333..c7b379a3fb 100644 --- a/core/src/main/scala/spark/CacheManager.scala +++ b/core/src/main/scala/spark/CacheManager.scala @@ -11,13 +11,13 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging { private val loading = new HashSet[String] /** Gets or computes an RDD split. Used by RDD.iterator() when an RDD is cached. */ - def getOrCompute[T](rdd: RDD[T], split: Split, context: TaskContext, storageLevel: StorageLevel) + def getOrCompute[T](rdd: RDD[T], split: Partition, context: TaskContext, storageLevel: StorageLevel) : Iterator[T] = { val key = "rdd_%d_%d".format(rdd.id, split.index) logInfo("Cache key is " + key) blockManager.get(key) match { case Some(cachedValues) => - // Split is in cache, so just return its values + // Partition is in cache, so just return its values logInfo("Found partition in cache!") return cachedValues.asInstanceOf[Iterator[T]] diff --git a/core/src/main/scala/spark/DoubleRDDFunctions.scala b/core/src/main/scala/spark/DoubleRDDFunctions.scala index b2a0e2b631..178d31a73b 100644 --- a/core/src/main/scala/spark/DoubleRDDFunctions.scala +++ b/core/src/main/scala/spark/DoubleRDDFunctions.scala @@ -42,14 +42,14 @@ class DoubleRDDFunctions(self: RDD[Double]) extends Logging with Serializable { /** (Experimental) Approximate operation to return the mean within a timeout. */ def meanApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble] = { val processPartition = (ctx: TaskContext, ns: Iterator[Double]) => StatCounter(ns) - val evaluator = new MeanEvaluator(self.splits.size, confidence) + val evaluator = new MeanEvaluator(self.partitions.size, confidence) self.context.runApproximateJob(self, processPartition, evaluator, timeout) } /** (Experimental) Approximate operation to return the sum within a timeout. 
*/ def sumApprox(timeout: Long, confidence: Double = 0.95): PartialResult[BoundedDouble] = { val processPartition = (ctx: TaskContext, ns: Iterator[Double]) => StatCounter(ns) - val evaluator = new SumEvaluator(self.splits.size, confidence) + val evaluator = new SumEvaluator(self.partitions.size, confidence) self.context.runApproximateJob(self, processPartition, evaluator, timeout) } } diff --git a/core/src/main/scala/spark/PairRDDFunctions.scala b/core/src/main/scala/spark/PairRDDFunctions.scala index 019be11ea8..4319cbd892 100644 --- a/core/src/main/scala/spark/PairRDDFunctions.scala +++ b/core/src/main/scala/spark/PairRDDFunctions.scala @@ -83,8 +83,8 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( def combineByKey[C](createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C, - numSplits: Int): RDD[(K, C)] = { - combineByKey(createCombiner, mergeValue, mergeCombiners, new HashPartitioner(numSplits)) + numPartitions: Int): RDD[(K, C)] = { + combineByKey(createCombiner, mergeValue, mergeCombiners, new HashPartitioner(numPartitions)) } /** @@ -145,10 +145,10 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( /** * Merge the values for each key using an associative reduce function. This will also perform * the merging locally on each mapper before sending results to a reducer, similarly to a - * "combiner" in MapReduce. Output will be hash-partitioned with numSplits splits. + * "combiner" in MapReduce. Output will be hash-partitioned with numPartitions partitions. */ - def reduceByKey(func: (V, V) => V, numSplits: Int): RDD[(K, V)] = { - reduceByKey(new HashPartitioner(numSplits), func) + def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)] = { + reduceByKey(new HashPartitioner(numPartitions), func) } /** @@ -166,10 +166,10 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( /** * Group the values for each key in the RDD into a single sequence. Hash-partitions the - * resulting RDD with into `numSplits` partitions. + * resulting RDD with into `numPartitions` partitions. */ - def groupByKey(numSplits: Int): RDD[(K, Seq[V])] = { - groupByKey(new HashPartitioner(numSplits)) + def groupByKey(numPartitions: Int): RDD[(K, Seq[V])] = { + groupByKey(new HashPartitioner(numPartitions)) } /** @@ -287,8 +287,8 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( * pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in `this` and * (k, v2) is in `other`. Performs a hash join across the cluster. */ - def join[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (V, W))] = { - join(other, new HashPartitioner(numSplits)) + def join[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, W))] = { + join(other, new HashPartitioner(numPartitions)) } /** @@ -305,10 +305,10 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( * Perform a left outer join of `this` and `other`. For each element (k, v) in `this`, the * resulting RDD will either contain all pairs (k, (v, Some(w))) for w in `other`, or the * pair (k, (v, None)) if no elements in `other` have key k. Hash-partitions the output - * into `numSplits` partitions. + * into `numPartitions` partitions. 
*/ - def leftOuterJoin[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (V, Option[W]))] = { - leftOuterJoin(other, new HashPartitioner(numSplits)) + def leftOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (V, Option[W]))] = { + leftOuterJoin(other, new HashPartitioner(numPartitions)) } /** @@ -327,8 +327,8 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( * pair (k, (None, w)) if no elements in `this` have key k. Hash-partitions the resulting * RDD into the given number of partitions. */ - def rightOuterJoin[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (Option[V], W))] = { - rightOuterJoin(other, new HashPartitioner(numSplits)) + def rightOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], W))] = { + rightOuterJoin(other, new HashPartitioner(numPartitions)) } /** @@ -414,17 +414,17 @@ class PairRDDFunctions[K: ClassManifest, V: ClassManifest]( * For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the * list of values for that key in `this` as well as `other`. */ - def cogroup[W](other: RDD[(K, W)], numSplits: Int): RDD[(K, (Seq[V], Seq[W]))] = { - cogroup(other, new HashPartitioner(numSplits)) + def cogroup[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Seq[V], Seq[W]))] = { + cogroup(other, new HashPartitioner(numPartitions)) } /** * For each key k in `this` or `other1` or `other2`, return a resulting RDD that contains a * tuple with the list of values for that key in `this`, `other1` and `other2`. */ - def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], numSplits: Int) + def cogroup[W1, W2](other1: RDD[(K, W1)], other2: RDD[(K, W2)], numPartitions: Int) : RDD[(K, (Seq[V], Seq[W1], Seq[W2]))] = { - cogroup(other1, other2, new HashPartitioner(numSplits)) + cogroup(other1, other2, new HashPartitioner(numPartitions)) } /** Alias for cogroup. */ @@ -636,9 +636,9 @@ class OrderedRDDFunctions[K <% Ordered[K]: ClassManifest, V: ClassManifest]( * (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in * order of the keys). 
*/ - def sortByKey(ascending: Boolean = true, numSplits: Int = self.splits.size): RDD[(K,V)] = { + def sortByKey(ascending: Boolean = true, numPartitions: Int = self.partitions.size): RDD[(K,V)] = { val shuffled = - new ShuffledRDD[K, V](self, new RangePartitioner(numSplits, self, ascending)) + new ShuffledRDD[K, V](self, new RangePartitioner(numPartitions, self, ascending)) shuffled.mapPartitions(iter => { val buf = iter.toArray if (ascending) { @@ -652,9 +652,9 @@ class OrderedRDDFunctions[K <% Ordered[K]: ClassManifest, V: ClassManifest]( private[spark] class MappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => U) extends RDD[(K, U)](prev) { - override def getSplits = firstParent[(K, V)].splits + override def getPartitions = firstParent[(K, V)].partitions override val partitioner = firstParent[(K, V)].partitioner - override def compute(split: Split, context: TaskContext) = + override def compute(split: Partition, context: TaskContext) = firstParent[(K, V)].iterator(split, context).map{ case (k, v) => (k, f(v)) } } @@ -662,9 +662,9 @@ private[spark] class FlatMappedValuesRDD[K, V, U](prev: RDD[(K, V)], f: V => TraversableOnce[U]) extends RDD[(K, U)](prev) { - override def getSplits = firstParent[(K, V)].splits + override def getPartitions = firstParent[(K, V)].partitions override val partitioner = firstParent[(K, V)].partitioner - override def compute(split: Split, context: TaskContext) = { + override def compute(split: Partition, context: TaskContext) = { firstParent[(K, V)].iterator(split, context).flatMap { case (k, v) => f(v).map(x => (k, x)) } } } diff --git a/core/src/main/scala/spark/Partition.scala b/core/src/main/scala/spark/Partition.scala new file mode 100644 index 0000000000..e384308ef6 --- /dev/null +++ b/core/src/main/scala/spark/Partition.scala @@ -0,0 +1,14 @@ +package spark + +/** + * A partition of an RDD. + */ +trait Partition extends Serializable { + /** + * Get the split's index within its parent RDD + */ + def index: Int + + // A better default implementation of HashCode + override def hashCode(): Int = index +} diff --git a/core/src/main/scala/spark/RDD.scala b/core/src/main/scala/spark/RDD.scala index f6e927a989..da82dfd10f 100644 --- a/core/src/main/scala/spark/RDD.scala +++ b/core/src/main/scala/spark/RDD.scala @@ -27,7 +27,7 @@ import spark.rdd.FlatMappedRDD import spark.rdd.GlommedRDD import spark.rdd.MappedRDD import spark.rdd.MapPartitionsRDD -import spark.rdd.MapPartitionsWithSplitRDD +import spark.rdd.MapPartitionsWithIndexRDD import spark.rdd.PipedRDD import spark.rdd.SampledRDD import spark.rdd.UnionRDD @@ -49,7 +49,7 @@ import SparkContext._ * * Internally, each RDD is characterized by five main properties: * - * - A list of splits (partitions) + * - A list of partitions * - A function for computing each split * - A list of dependencies on other RDDs * - Optionally, a Partitioner for key-value RDDs (e.g. to say that the RDD is hash-partitioned) @@ -76,13 +76,13 @@ abstract class RDD[T: ClassManifest]( // ======================================================================= /** Implemented by subclasses to compute a given partition. */ - def compute(split: Split, context: TaskContext): Iterator[T] + def compute(split: Partition, context: TaskContext): Iterator[T] /** * Implemented by subclasses to return the set of partitions in this RDD. This method will only * be called once, so it is safe to implement a time-consuming computation in it. 
*/ - protected def getSplits: Array[Split] + protected def getPartitions: Array[Partition] /** * Implemented by subclasses to return how this RDD depends on parent RDDs. This method will only @@ -91,7 +91,7 @@ abstract class RDD[T: ClassManifest]( protected def getDependencies: Seq[Dependency[_]] = deps /** Optionally overridden by subclasses to specify placement preferences. */ - protected def getPreferredLocations(split: Split): Seq[String] = Nil + protected def getPreferredLocations(split: Partition): Seq[String] = Nil /** Optionally overridden by subclasses to specify how they are partitioned. */ val partitioner: Option[Partitioner] = None @@ -137,10 +137,10 @@ abstract class RDD[T: ClassManifest]( /** Get the RDD's current storage level, or StorageLevel.NONE if none is set. */ def getStorageLevel = storageLevel - // Our dependencies and splits will be gotten by calling subclass's methods below, and will + // Our dependencies and partitions will be gotten by calling subclass's methods below, and will // be overwritten when we're checkpointed private var dependencies_ : Seq[Dependency[_]] = null - @transient private var splits_ : Array[Split] = null + @transient private var partitions_ : Array[Partition] = null /** An Option holding our checkpoint RDD, if we are checkpointed */ private def checkpointRDD: Option[RDD[T]] = checkpointData.flatMap(_.checkpointRDD) @@ -159,15 +159,15 @@ abstract class RDD[T: ClassManifest]( } /** - * Get the array of splits of this RDD, taking into account whether the + * Get the array of partitions of this RDD, taking into account whether the * RDD is checkpointed or not. */ - final def splits: Array[Split] = { - checkpointRDD.map(_.splits).getOrElse { - if (splits_ == null) { - splits_ = getSplits + final def partitions: Array[Partition] = { + checkpointRDD.map(_.partitions).getOrElse { + if (partitions_ == null) { + partitions_ = getPartitions } - splits_ + partitions_ } } @@ -175,7 +175,7 @@ abstract class RDD[T: ClassManifest]( * Get the preferred location of a split, taking into account whether the * RDD is checkpointed or not. */ - final def preferredLocations(split: Split): Seq[String] = { + final def preferredLocations(split: Partition): Seq[String] = { checkpointRDD.map(_.getPreferredLocations(split)).getOrElse { getPreferredLocations(split) } @@ -186,7 +186,7 @@ abstract class RDD[T: ClassManifest]( * This should ''not'' be called by users directly, but is available for implementors of custom * subclasses of RDD. */ - final def iterator(split: Split, context: TaskContext): Iterator[T] = { + final def iterator(split: Partition, context: TaskContext): Iterator[T] = { if (storageLevel != StorageLevel.NONE) { SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel) } else { @@ -197,7 +197,7 @@ abstract class RDD[T: ClassManifest]( /** * Compute an RDD partition or read it from a checkpoint if the RDD is checkpointing. */ - private[spark] def computeOrReadCheckpoint(split: Split, context: TaskContext): Iterator[T] = { + private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] = { if (isCheckpointed) { firstParent[T].iterator(split, context) } else { @@ -227,15 +227,15 @@ abstract class RDD[T: ClassManifest]( /** * Return a new RDD containing the distinct elements in this RDD. 
*/ - def distinct(numSplits: Int): RDD[T] = - map(x => (x, null)).reduceByKey((x, y) => x, numSplits).map(_._1) + def distinct(numPartitions: Int): RDD[T] = + map(x => (x, null)).reduceByKey((x, y) => x, numPartitions).map(_._1) - def distinct(): RDD[T] = distinct(splits.size) + def distinct(): RDD[T] = distinct(partitions.size) /** - * Return a new RDD that is reduced into `numSplits` partitions. + * Return a new RDD that is reduced into `numPartitions` partitions. */ - def coalesce(numSplits: Int): RDD[T] = new CoalescedRDD(this, numSplits) + def coalesce(numPartitions: Int): RDD[T] = new CoalescedRDD(this, numPartitions) /** * Return a sampled subset of this RDD. @@ -303,9 +303,9 @@ abstract class RDD[T: ClassManifest]( * Return an RDD of grouped elements. Each group consists of a key and a sequence of elements * mapping to that key. */ - def groupBy[K: ClassManifest](f: T => K, numSplits: Int): RDD[(K, Seq[T])] = { + def groupBy[K: ClassManifest](f: T => K, numPartitions: Int): RDD[(K, Seq[T])] = { val cleanF = sc.clean(f) - this.map(t => (cleanF(t), t)).groupByKey(numSplits) + this.map(t => (cleanF(t), t)).groupByKey(numPartitions) } /** @@ -336,14 +336,24 @@ abstract class RDD[T: ClassManifest]( preservesPartitioning: Boolean = false): RDD[U] = new MapPartitionsRDD(this, sc.clean(f), preservesPartitioning) - /** + /** + * Return a new RDD by applying a function to each partition of this RDD, while tracking the index + * of the original partition. + */ + def mapPartitionsWithIndex[U: ClassManifest]( + f: (Int, Iterator[T]) => Iterator[U], + preservesPartitioning: Boolean = false): RDD[U] = + new MapPartitionsWithIndexRDD(this, sc.clean(f), preservesPartitioning) + + /** * Return a new RDD by applying a function to each partition of this RDD, while tracking the index * of the original partition. */ + @deprecated("use mapPartitionsWithIndex") def mapPartitionsWithSplit[U: ClassManifest]( f: (Int, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] = - new MapPartitionsWithSplitRDD(this, sc.clean(f), preservesPartitioning) + new MapPartitionsWithIndexRDD(this, sc.clean(f), preservesPartitioning) /** * Zips this RDD with another one, returning key-value pairs with the first element in each RDD, @@ -471,7 +481,7 @@ abstract class RDD[T: ClassManifest]( } result } - val evaluator = new CountEvaluator(splits.size, confidence) + val evaluator = new CountEvaluator(partitions.size, confidence) sc.runApproximateJob(this, countElements, evaluator, timeout) } @@ -522,7 +532,7 @@ abstract class RDD[T: ClassManifest]( } map } - val evaluator = new GroupedCountEvaluator[T](splits.size, confidence) + val evaluator = new GroupedCountEvaluator[T](partitions.size, confidence) sc.runApproximateJob(this, countPartition, evaluator, timeout) } @@ -537,7 +547,7 @@ abstract class RDD[T: ClassManifest]( } val buf = new ArrayBuffer[T] var p = 0 - while (buf.size < num && p < splits.size) { + while (buf.size < num && p < partitions.size) { val left = num - buf.size val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, Array(p), true) buf ++= res(0) @@ -657,11 +667,11 @@ abstract class RDD[T: ClassManifest]( /** * Changes the dependencies of this RDD from its original parents to a new RDD (`newRDD`) - * created from the checkpoint file, and forget its old dependencies and splits. + * created from the checkpoint file, and forget its old dependencies and partitions. 
*/ private[spark] def markCheckpointed(checkpointRDD: RDD[_]) { clearDependencies() - splits_ = null + partitions_ = null deps = null // Forget the constructor argument for dependencies too } @@ -676,15 +686,15 @@ abstract class RDD[T: ClassManifest]( } /** A description of this RDD and its recursive dependencies for debugging. */ - def toDebugString(): String = { + def toDebugString: String = { def debugString(rdd: RDD[_], prefix: String = ""): Seq[String] = { - Seq(prefix + rdd + " (" + rdd.splits.size + " splits)") ++ + Seq(prefix + rdd + " (" + rdd.partitions.size + " partitions)") ++ rdd.dependencies.flatMap(d => debugString(d.rdd, prefix + " ")) } debugString(this).mkString("\n") } - override def toString(): String = "%s%s[%d] at %s".format( + override def toString: String = "%s%s[%d] at %s".format( Option(name).map(_ + " ").getOrElse(""), getClass.getSimpleName, id, diff --git a/core/src/main/scala/spark/RDDCheckpointData.scala b/core/src/main/scala/spark/RDDCheckpointData.scala index a4a4ebaf53..d00092e984 100644 --- a/core/src/main/scala/spark/RDDCheckpointData.scala +++ b/core/src/main/scala/spark/RDDCheckpointData.scala @@ -16,7 +16,7 @@ private[spark] object CheckpointState extends Enumeration { /** * This class contains all the information related to RDD checkpointing. Each instance of this class * is associated with a RDD. It manages process of checkpointing of the associated RDD, as well as, - * manages the post-checkpoint state by providing the updated splits, iterator and preferred locations + * manages the post-checkpoint state by providing the updated partitions, iterator and preferred locations * of the checkpointed RDD. */ private[spark] class RDDCheckpointData[T: ClassManifest](rdd: RDD[T]) @@ -67,11 +67,11 @@ private[spark] class RDDCheckpointData[T: ClassManifest](rdd: RDD[T]) rdd.context.runJob(rdd, CheckpointRDD.writeToFile(path) _) val newRDD = new CheckpointRDD[T](rdd.context, path) - // Change the dependencies and splits of the RDD + // Change the dependencies and partitions of the RDD RDDCheckpointData.synchronized { cpFile = Some(path) cpRDD = Some(newRDD) - rdd.markCheckpointed(newRDD) // Update the RDD's dependencies and splits + rdd.markCheckpointed(newRDD) // Update the RDD's dependencies and partitions cpState = Checkpointed RDDCheckpointData.clearTaskCaches() logInfo("Done checkpointing RDD " + rdd.id + ", new parent is RDD " + newRDD.id) @@ -79,15 +79,15 @@ private[spark] class RDDCheckpointData[T: ClassManifest](rdd: RDD[T]) } // Get preferred location of a split after checkpointing - def getPreferredLocations(split: Split): Seq[String] = { + def getPreferredLocations(split: Partition): Seq[String] = { RDDCheckpointData.synchronized { cpRDD.get.preferredLocations(split) } } - def getSplits: Array[Split] = { + def getPartitions: Array[Partition] = { RDDCheckpointData.synchronized { - cpRDD.get.splits + cpRDD.get.partitions } } diff --git a/core/src/main/scala/spark/SparkContext.scala b/core/src/main/scala/spark/SparkContext.scala index 047b57dc1f..f299b7ea46 100644 --- a/core/src/main/scala/spark/SparkContext.scala +++ b/core/src/main/scala/spark/SparkContext.scala @@ -614,14 +614,14 @@ class SparkContext( * Run a job on all partitions in an RDD and return the results in an array. 
*/ def runJob[T, U: ClassManifest](rdd: RDD[T], func: (TaskContext, Iterator[T]) => U): Array[U] = { - runJob(rdd, func, 0 until rdd.splits.size, false) + runJob(rdd, func, 0 until rdd.partitions.size, false) } /** * Run a job on all partitions in an RDD and return the results in an array. */ def runJob[T, U: ClassManifest](rdd: RDD[T], func: Iterator[T] => U): Array[U] = { - runJob(rdd, func, 0 until rdd.splits.size, false) + runJob(rdd, func, 0 until rdd.partitions.size, false) } /** @@ -632,7 +632,7 @@ class SparkContext( processPartition: (TaskContext, Iterator[T]) => U, resultHandler: (Int, U) => Unit) { - runJob[T, U](rdd, processPartition, 0 until rdd.splits.size, false, resultHandler) + runJob[T, U](rdd, processPartition, 0 until rdd.partitions.size, false, resultHandler) } /** @@ -644,7 +644,7 @@ class SparkContext( resultHandler: (Int, U) => Unit) { val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter) - runJob[T, U](rdd, processFunc, 0 until rdd.splits.size, false, resultHandler) + runJob[T, U](rdd, processFunc, 0 until rdd.partitions.size, false, resultHandler) } /** @@ -696,7 +696,7 @@ class SparkContext( /** Default level of parallelism to use when not given by user (e.g. for reduce tasks) */ def defaultParallelism: Int = taskScheduler.defaultParallelism - /** Default min number of splits for Hadoop RDDs when not given by user */ + /** Default min number of partitions for Hadoop RDDs when not given by user */ def defaultMinSplits: Int = math.min(defaultParallelism, 2) private var nextShuffleId = new AtomicInteger(0) diff --git a/core/src/main/scala/spark/Split.scala b/core/src/main/scala/spark/Split.scala deleted file mode 100644 index 90d4b47c55..0000000000 --- a/core/src/main/scala/spark/Split.scala +++ /dev/null @@ -1,14 +0,0 @@ -package spark - -/** - * A partition of an RDD. - */ -trait Split extends Serializable { - /** - * Get the split's index within its parent RDD - */ - def index: Int - - // A better default implementation of HashCode - override def hashCode(): Int = index -} diff --git a/core/src/main/scala/spark/api/java/JavaDoubleRDD.scala b/core/src/main/scala/spark/api/java/JavaDoubleRDD.scala index 2810631b41..da3cb2cd31 100644 --- a/core/src/main/scala/spark/api/java/JavaDoubleRDD.scala +++ b/core/src/main/scala/spark/api/java/JavaDoubleRDD.scala @@ -44,7 +44,7 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, Jav /** * Return a new RDD containing the distinct elements in this RDD. */ - def distinct(numSplits: Int): JavaDoubleRDD = fromRDD(srdd.distinct(numSplits)) + def distinct(numPartitions: Int): JavaDoubleRDD = fromRDD(srdd.distinct(numPartitions)) /** * Return a new RDD containing only the elements that satisfy a predicate. @@ -53,9 +53,9 @@ class JavaDoubleRDD(val srdd: RDD[scala.Double]) extends JavaRDDLike[Double, Jav fromRDD(srdd.filter(x => f(x).booleanValue())) /** - * Return a new RDD that is reduced into `numSplits` partitions. + * Return a new RDD that is reduced into `numPartitions` partitions. */ - def coalesce(numSplits: Int): JavaDoubleRDD = fromRDD(srdd.coalesce(numSplits)) + def coalesce(numPartitions: Int): JavaDoubleRDD = fromRDD(srdd.coalesce(numPartitions)) /** * Return a sampled subset of this RDD. 
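Taken together, the hunks above rename the user-facing numSplits parameters to numPartitions, replace rdd.splits with rdd.partitions, and deprecate mapPartitionsWithSplit in favor of the new mapPartitionsWithIndex. As a minimal, illustrative sketch of caller code against the renamed API (not part of the patch; the object name, master URL, and sample data below are assumptions made for the example):

import spark.SparkContext
import spark.SparkContext._

object PartitionRenameExample {
  def main(args: Array[String]) {
    // "local" master and the sample pairs are placeholders for this sketch.
    val sc = new SparkContext("local", "PartitionRenameExample")
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

    // The second argument is now named numPartitions (was numSplits).
    val counts = pairs.reduceByKey(_ + _, 4)

    // rdd.partitions replaces rdd.splits.
    println("number of partitions: " + counts.partitions.size)

    // mapPartitionsWithIndex replaces the deprecated mapPartitionsWithSplit.
    val tagged = counts.mapPartitionsWithIndex { (idx, iter) =>
      iter.map { case (k, v) => (idx, k, v) }
    }
    tagged.collect().foreach(println)
    sc.stop()
  }
}

The behavior is unchanged by this series; only the parameter and method names differ from the pre-rename API.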
diff --git a/core/src/main/scala/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/spark/api/java/JavaPairRDD.scala index 55dc755358..df3af3817d 100644 --- a/core/src/main/scala/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/spark/api/java/JavaPairRDD.scala @@ -54,7 +54,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif /** * Return a new RDD containing the distinct elements in this RDD. */ - def distinct(numSplits: Int): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.distinct(numSplits)) + def distinct(numPartitions: Int): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.distinct(numPartitions)) /** * Return a new RDD containing only the elements that satisfy a predicate. @@ -63,9 +63,9 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif new JavaPairRDD[K, V](rdd.filter(x => f(x).booleanValue())) /** - * Return a new RDD that is reduced into `numSplits` partitions. + * Return a new RDD that is reduced into `numPartitions` partitions. */ - def coalesce(numSplits: Int): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.coalesce(numSplits)) + def coalesce(numPartitions: Int): JavaPairRDD[K, V] = new JavaPairRDD[K, V](rdd.coalesce(numPartitions)) /** * Return a sampled subset of this RDD. @@ -122,8 +122,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif def combineByKey[C](createCombiner: JFunction[V, C], mergeValue: JFunction2[C, V, C], mergeCombiners: JFunction2[C, C, C], - numSplits: Int): JavaPairRDD[K, C] = - combineByKey(createCombiner, mergeValue, mergeCombiners, new HashPartitioner(numSplits)) + numPartitions: Int): JavaPairRDD[K, C] = + combineByKey(createCombiner, mergeValue, mergeCombiners, new HashPartitioner(numPartitions)) /** * Merge the values for each key using an associative reduce function. This will also perform @@ -162,10 +162,10 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif /** * Merge the values for each key using an associative reduce function. This will also perform * the merging locally on each mapper before sending results to a reducer, similarly to a - * "combiner" in MapReduce. Output will be hash-partitioned with numSplits splits. + * "combiner" in MapReduce. Output will be hash-partitioned with numPartitions partitions. */ - def reduceByKey(func: JFunction2[V, V, V], numSplits: Int): JavaPairRDD[K, V] = - fromRDD(rdd.reduceByKey(func, numSplits)) + def reduceByKey(func: JFunction2[V, V, V], numPartitions: Int): JavaPairRDD[K, V] = + fromRDD(rdd.reduceByKey(func, numPartitions)) /** * Group the values for each key in the RDD into a single sequence. Allows controlling the @@ -176,10 +176,10 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif /** * Group the values for each key in the RDD into a single sequence. Hash-partitions the - * resulting RDD with into `numSplits` partitions. + * resulting RDD with into `numPartitions` partitions. */ - def groupByKey(numSplits: Int): JavaPairRDD[K, JList[V]] = - fromRDD(groupByResultToJava(rdd.groupByKey(numSplits))) + def groupByKey(numPartitions: Int): JavaPairRDD[K, JList[V]] = + fromRDD(groupByResultToJava(rdd.groupByKey(numPartitions))) /** * Return a copy of the RDD partitioned using the specified partitioner. 
If `mapSideCombine` @@ -261,8 +261,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif * pair of elements will be returned as a (k, (v1, v2)) tuple, where (k, v1) is in `this` and * (k, v2) is in `other`. Performs a hash join across the cluster. */ - def join[W](other: JavaPairRDD[K, W], numSplits: Int): JavaPairRDD[K, (V, W)] = - fromRDD(rdd.join(other, numSplits)) + def join[W](other: JavaPairRDD[K, W], numPartitions: Int): JavaPairRDD[K, (V, W)] = + fromRDD(rdd.join(other, numPartitions)) /** * Perform a left outer join of `this` and `other`. For each element (k, v) in `this`, the @@ -277,10 +277,10 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif * Perform a left outer join of `this` and `other`. For each element (k, v) in `this`, the * resulting RDD will either contain all pairs (k, (v, Some(w))) for w in `other`, or the * pair (k, (v, None)) if no elements in `other` have key k. Hash-partitions the output - * into `numSplits` partitions. + * into `numPartitions` partitions. */ - def leftOuterJoin[W](other: JavaPairRDD[K, W], numSplits: Int): JavaPairRDD[K, (V, Option[W])] = - fromRDD(rdd.leftOuterJoin(other, numSplits)) + def leftOuterJoin[W](other: JavaPairRDD[K, W], numPartitions: Int): JavaPairRDD[K, (V, Option[W])] = + fromRDD(rdd.leftOuterJoin(other, numPartitions)) /** * Perform a right outer join of `this` and `other`. For each element (k, w) in `other`, the @@ -297,8 +297,8 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif * pair (k, (None, w)) if no elements in `this` have key k. Hash-partitions the resulting * RDD into the given number of partitions. */ - def rightOuterJoin[W](other: JavaPairRDD[K, W], numSplits: Int): JavaPairRDD[K, (Option[V], W)] = - fromRDD(rdd.rightOuterJoin(other, numSplits)) + def rightOuterJoin[W](other: JavaPairRDD[K, W], numPartitions: Int): JavaPairRDD[K, (Option[V], W)] = + fromRDD(rdd.rightOuterJoin(other, numPartitions)) /** * Return the key-value pairs in this RDD to the master as a Map. @@ -362,16 +362,16 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])(implicit val kManifest: ClassManif * For each key k in `this` or `other`, return a resulting RDD that contains a tuple with the * list of values for that key in `this` as well as `other`. */ - def cogroup[W](other: JavaPairRDD[K, W], numSplits: Int): JavaPairRDD[K, (JList[V], JList[W])] - = fromRDD(cogroupResultToJava(rdd.cogroup(other, numSplits))) + def cogroup[W](other: JavaPairRDD[K, W], numPartitions: Int): JavaPairRDD[K, (JList[V], JList[W])] + = fromRDD(cogroupResultToJava(rdd.cogroup(other, numPartitions))) /** * For each key k in `this` or `other1` or `other2`, return a resulting RDD that contains a * tuple with the list of values for that key in `this`, `other1` and `other2`. */ - def cogroup[W1, W2](other1: JavaPairRDD[K, W1], other2: JavaPairRDD[K, W2], numSplits: Int) + def cogroup[W1, W2](other1: JavaPairRDD[K, W1], other2: JavaPairRDD[K, W2], numPartitions: Int) : JavaPairRDD[K, (JList[V], JList[W1], JList[W2])] = - fromRDD(cogroupResult2ToJava(rdd.cogroup(other1, other2, numSplits))) + fromRDD(cogroupResult2ToJava(rdd.cogroup(other1, other2, numPartitions))) /** Alias for cogroup. 
*/ def groupWith[W](other: JavaPairRDD[K, W]): JavaPairRDD[K, (JList[V], JList[W])] = diff --git a/core/src/main/scala/spark/api/java/JavaRDD.scala b/core/src/main/scala/spark/api/java/JavaRDD.scala index 23e7ae2726..3ccd6f055e 100644 --- a/core/src/main/scala/spark/api/java/JavaRDD.scala +++ b/core/src/main/scala/spark/api/java/JavaRDD.scala @@ -30,7 +30,7 @@ JavaRDDLike[T, JavaRDD[T]] { /** * Return a new RDD containing the distinct elements in this RDD. */ - def distinct(numSplits: Int): JavaRDD[T] = wrapRDD(rdd.distinct(numSplits)) + def distinct(numPartitions: Int): JavaRDD[T] = wrapRDD(rdd.distinct(numPartitions)) /** * Return a new RDD containing only the elements that satisfy a predicate. @@ -39,9 +39,9 @@ JavaRDDLike[T, JavaRDD[T]] { wrapRDD(rdd.filter((x => f(x).booleanValue()))) /** - * Return a new RDD that is reduced into `numSplits` partitions. + * Return a new RDD that is reduced into `numPartitions` partitions. */ - def coalesce(numSplits: Int): JavaRDD[T] = rdd.coalesce(numSplits) + def coalesce(numPartitions: Int): JavaRDD[T] = rdd.coalesce(numPartitions) /** * Return a sampled subset of this RDD. diff --git a/core/src/main/scala/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/spark/api/java/JavaRDDLike.scala index d34d56d169..90b45cf875 100644 --- a/core/src/main/scala/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/spark/api/java/JavaRDDLike.scala @@ -4,7 +4,7 @@ import java.util.{List => JList} import scala.Tuple2 import scala.collection.JavaConversions._ -import spark.{SparkContext, Split, RDD, TaskContext} +import spark.{SparkContext, Partition, RDD, TaskContext} import spark.api.java.JavaPairRDD._ import spark.api.java.function.{Function2 => JFunction2, Function => JFunction, _} import spark.partial.{PartialResult, BoundedDouble} @@ -20,7 +20,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround def rdd: RDD[T] /** Set of partitions in this RDD. */ - def splits: JList[Split] = new java.util.ArrayList(rdd.splits.toSeq) + def splits: JList[Partition] = new java.util.ArrayList(rdd.partitions.toSeq) /** The [[spark.SparkContext]] that this RDD was created on. */ def context: SparkContext = rdd.context @@ -36,7 +36,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround * This should ''not'' be called by users directly, but is available for implementors of custom * subclasses of RDD. */ - def iterator(split: Split, taskContext: TaskContext): java.util.Iterator[T] = + def iterator(split: Partition, taskContext: TaskContext): java.util.Iterator[T] = asJavaIterator(rdd.iterator(split, taskContext)) // Transformations (return a new RDD) @@ -146,12 +146,12 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround * Return an RDD of grouped elements. Each group consists of a key and a sequence of elements * mapping to that key. 
*/ - def groupBy[K](f: JFunction[T, K], numSplits: Int): JavaPairRDD[K, JList[T]] = { + def groupBy[K](f: JFunction[T, K], numPartitions: Int): JavaPairRDD[K, JList[T]] = { implicit val kcm: ClassManifest[K] = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K]] implicit val vcm: ClassManifest[JList[T]] = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[JList[T]]] - JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f, numSplits)(f.returnType)))(kcm, vcm) + JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f, numPartitions)(f.returnType)))(kcm, vcm) } /** @@ -333,6 +333,6 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround /** A description of this RDD and its recursive dependencies for debugging. */ def toDebugString(): String = { - rdd.toDebugString() + rdd.toDebugString } } diff --git a/core/src/main/scala/spark/api/python/PythonRDD.scala b/core/src/main/scala/spark/api/python/PythonRDD.scala index ab8351e55e..8c73477384 100644 --- a/core/src/main/scala/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/spark/api/python/PythonRDD.scala @@ -32,11 +32,11 @@ private[spark] class PythonRDD[T: ClassManifest]( this(parent, PipedRDD.tokenize(command), envVars, preservePartitoning, pythonExec, broadcastVars, accumulator) - override def getSplits = parent.splits + override def getPartitions = parent.partitions override val partitioner = if (preservePartitoning) parent.partitioner else None - override def compute(split: Split, context: TaskContext): Iterator[Array[Byte]] = { + override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { val SPARK_HOME = new ProcessBuilder().environment().get("SPARK_HOME") val pb = new ProcessBuilder(Seq(pythonExec, SPARK_HOME + "/python/pyspark/worker.py")) @@ -65,7 +65,7 @@ private[spark] class PythonRDD[T: ClassManifest]( SparkEnv.set(env) val out = new PrintWriter(proc.getOutputStream) val dOut = new DataOutputStream(proc.getOutputStream) - // Split index + // Partition index dOut.writeInt(split.index) // sparkFilesDir PythonRDD.writeAsPickle(SparkFiles.getRootDirectory, dOut) @@ -155,8 +155,8 @@ private class PythonException(msg: String) extends Exception(msg) */ private class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Array[Byte], Array[Byte])](prev) { - override def getSplits = prev.splits - override def compute(split: Split, context: TaskContext) = + override def getPartitions = prev.partitions + override def compute(split: Partition, context: TaskContext) = prev.iterator(split, context).grouped(2).map { case Seq(a, b) => (a, b) case x => throw new Exception("PairwiseRDD: unexpected value: " + x) diff --git a/core/src/main/scala/spark/partial/ApproximateActionListener.scala b/core/src/main/scala/spark/partial/ApproximateActionListener.scala index 24b4909380..de2dce161a 100644 --- a/core/src/main/scala/spark/partial/ApproximateActionListener.scala +++ b/core/src/main/scala/spark/partial/ApproximateActionListener.scala @@ -20,7 +20,7 @@ private[spark] class ApproximateActionListener[T, U, R]( extends JobListener { val startTime = System.currentTimeMillis() - val totalTasks = rdd.splits.size + val totalTasks = rdd.partitions.size var finishedTasks = 0 var failure: Option[Exception] = None // Set if the job has failed (permanently) var resultObject: Option[PartialResult[R]] = None // Set if we've already returned a PartialResult diff --git a/core/src/main/scala/spark/rdd/BlockRDD.scala b/core/src/main/scala/spark/rdd/BlockRDD.scala index 17989c5ce5..7348c4f15b 
100644 --- a/core/src/main/scala/spark/rdd/BlockRDD.scala +++ b/core/src/main/scala/spark/rdd/BlockRDD.scala @@ -1,9 +1,9 @@ package spark.rdd import scala.collection.mutable.HashMap -import spark.{RDD, SparkContext, SparkEnv, Split, TaskContext} +import spark.{RDD, SparkContext, SparkEnv, Partition, TaskContext} -private[spark] class BlockRDDSplit(val blockId: String, idx: Int) extends Split { +private[spark] class BlockRDDPartition(val blockId: String, idx: Int) extends Partition { val index = idx } @@ -18,14 +18,14 @@ class BlockRDD[T: ClassManifest](sc: SparkContext, @transient blockIds: Array[St HashMap(blockIds.zip(locations):_*) } - override def getSplits: Array[Split] = (0 until blockIds.size).map(i => { - new BlockRDDSplit(blockIds(i), i).asInstanceOf[Split] + override def getPartitions: Array[Partition] = (0 until blockIds.size).map(i => { + new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }).toArray - override def compute(split: Split, context: TaskContext): Iterator[T] = { + override def compute(split: Partition, context: TaskContext): Iterator[T] = { val blockManager = SparkEnv.get.blockManager - val blockId = split.asInstanceOf[BlockRDDSplit].blockId + val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get(blockId) match { case Some(block) => block.asInstanceOf[Iterator[T]] case None => @@ -33,8 +33,8 @@ class BlockRDD[T: ClassManifest](sc: SparkContext, @transient blockIds: Array[St } } - override def getPreferredLocations(split: Split): Seq[String] = - locations_(split.asInstanceOf[BlockRDDSplit].blockId) + override def getPreferredLocations(split: Partition): Seq[String] = + locations_(split.asInstanceOf[BlockRDDPartition].blockId) } diff --git a/core/src/main/scala/spark/rdd/CartesianRDD.scala b/core/src/main/scala/spark/rdd/CartesianRDD.scala index 41cbbd0093..38600b8be4 100644 --- a/core/src/main/scala/spark/rdd/CartesianRDD.scala +++ b/core/src/main/scala/spark/rdd/CartesianRDD.scala @@ -5,22 +5,22 @@ import spark._ private[spark] -class CartesianSplit( +class CartesianPartition( idx: Int, @transient rdd1: RDD[_], @transient rdd2: RDD[_], s1Index: Int, s2Index: Int - ) extends Split { - var s1 = rdd1.splits(s1Index) - var s2 = rdd2.splits(s2Index) + ) extends Partition { + var s1 = rdd1.partitions(s1Index) + var s2 = rdd2.partitions(s2Index) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream) { // Update the reference to parent split at the time of task serialization - s1 = rdd1.splits(s1Index) - s2 = rdd2.splits(s2Index) + s1 = rdd1.partitions(s1Index) + s2 = rdd2.partitions(s2Index) oos.defaultWriteObject() } } @@ -33,35 +33,35 @@ class CartesianRDD[T: ClassManifest, U:ClassManifest]( extends RDD[Pair[T, U]](sc, Nil) with Serializable { - val numSplitsInRdd2 = rdd2.splits.size + val numPartitionsInRdd2 = rdd2.partitions.size - override def getSplits: Array[Split] = { + override def getPartitions: Array[Partition] = { // create the cross product split - val array = new Array[Split](rdd1.splits.size * rdd2.splits.size) - for (s1 <- rdd1.splits; s2 <- rdd2.splits) { - val idx = s1.index * numSplitsInRdd2 + s2.index - array(idx) = new CartesianSplit(idx, rdd1, rdd2, s1.index, s2.index) + val array = new Array[Partition](rdd1.partitions.size * rdd2.partitions.size) + for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) { + val idx = s1.index * numPartitionsInRdd2 + s2.index + array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index) } array } - override def 
getPreferredLocations(split: Split): Seq[String] = { - val currSplit = split.asInstanceOf[CartesianSplit] + override def getPreferredLocations(split: Partition): Seq[String] = { + val currSplit = split.asInstanceOf[CartesianPartition] rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2) } - override def compute(split: Split, context: TaskContext) = { - val currSplit = split.asInstanceOf[CartesianSplit] + override def compute(split: Partition, context: TaskContext) = { + val currSplit = split.asInstanceOf[CartesianPartition] for (x <- rdd1.iterator(currSplit.s1, context); y <- rdd2.iterator(currSplit.s2, context)) yield (x, y) } override def getDependencies: Seq[Dependency[_]] = List( new NarrowDependency(rdd1) { - def getParents(id: Int): Seq[Int] = List(id / numSplitsInRdd2) + def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2) }, new NarrowDependency(rdd2) { - def getParents(id: Int): Seq[Int] = List(id % numSplitsInRdd2) + def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2) } ) diff --git a/core/src/main/scala/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/spark/rdd/CheckpointRDD.scala index 3558d4673f..36bfb0355e 100644 --- a/core/src/main/scala/spark/rdd/CheckpointRDD.scala +++ b/core/src/main/scala/spark/rdd/CheckpointRDD.scala @@ -9,7 +9,7 @@ import org.apache.hadoop.fs.Path import java.io.{File, IOException, EOFException} import java.text.NumberFormat -private[spark] class CheckpointRDDSplit(val index: Int) extends Split {} +private[spark] class CheckpointRDDPartition(val index: Int) extends Partition {} /** * This RDD represents a RDD checkpoint file (similar to HadoopRDD). @@ -20,27 +20,27 @@ class CheckpointRDD[T: ClassManifest](sc: SparkContext, val checkpointPath: Stri @transient val fs = new Path(checkpointPath).getFileSystem(sc.hadoopConfiguration) - override def getSplits: Array[Split] = { + override def getPartitions: Array[Partition] = { val dirContents = fs.listStatus(new Path(checkpointPath)) val splitFiles = dirContents.map(_.getPath.toString).filter(_.contains("part-")).sorted - val numSplits = splitFiles.size + val numPartitions = splitFiles.size if (!splitFiles(0).endsWith(CheckpointRDD.splitIdToFile(0)) || - !splitFiles(numSplits-1).endsWith(CheckpointRDD.splitIdToFile(numSplits-1))) { + !splitFiles(numPartitions-1).endsWith(CheckpointRDD.splitIdToFile(numPartitions-1))) { throw new SparkException("Invalid checkpoint directory: " + checkpointPath) } - Array.tabulate(numSplits)(i => new CheckpointRDDSplit(i)) + Array.tabulate(numPartitions)(i => new CheckpointRDDPartition(i)) } checkpointData = Some(new RDDCheckpointData[T](this)) checkpointData.get.cpFile = Some(checkpointPath) - override def getPreferredLocations(split: Split): Seq[String] = { + override def getPreferredLocations(split: Partition): Seq[String] = { val status = fs.getFileStatus(new Path(checkpointPath)) val locations = fs.getFileBlockLocations(status, 0, status.getLen) locations.headOption.toList.flatMap(_.getHosts).filter(_ != "localhost") } - override def compute(split: Split, context: TaskContext): Iterator[T] = { + override def compute(split: Partition, context: TaskContext): Iterator[T] = { val file = new Path(checkpointPath, CheckpointRDD.splitIdToFile(split.index)) CheckpointRDD.readFromFile(file, context) } @@ -107,7 +107,7 @@ private[spark] object CheckpointRDD extends Logging { deserializeStream.asIterator.asInstanceOf[Iterator[T]] } - // Test whether CheckpointRDD generate expected number of splits despite + // Test whether 
CheckpointRDD generate expected number of partitions despite // each split file having multiple blocks. This needs to be run on a // cluster (mesos or standalone) using HDFS. def main(args: Array[String]) { @@ -120,8 +120,8 @@ private[spark] object CheckpointRDD extends Logging { val fs = path.getFileSystem(new Configuration()) sc.runJob(rdd, CheckpointRDD.writeToFile(path.toString, 1024) _) val cpRDD = new CheckpointRDD[Int](sc, path.toString) - assert(cpRDD.splits.length == rdd.splits.length, "Number of splits is not the same") - assert(cpRDD.collect.toList == rdd.collect.toList, "Data of splits not the same") + assert(cpRDD.partitions.length == rdd.partitions.length, "Number of partitions is not the same") + assert(cpRDD.collect.toList == rdd.collect.toList, "Data of partitions not the same") fs.delete(path) } } diff --git a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala index 868ee5a39f..5200fb6b65 100644 --- a/core/src/main/scala/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoGroupedRDD.scala @@ -5,7 +5,7 @@ import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions import scala.collection.mutable.ArrayBuffer -import spark.{Aggregator, Logging, Partitioner, RDD, SparkEnv, Split, TaskContext} +import spark.{Aggregator, Logging, Partitioner, RDD, SparkEnv, Partition, TaskContext} import spark.{Dependency, OneToOneDependency, ShuffleDependency} @@ -14,13 +14,13 @@ private[spark] sealed trait CoGroupSplitDep extends Serializable private[spark] case class NarrowCoGroupSplitDep( rdd: RDD[_], splitIndex: Int, - var split: Split + var split: Partition ) extends CoGroupSplitDep { @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream) { // Update the reference to parent split at the time of task serialization - split = rdd.splits(splitIndex) + split = rdd.partitions(splitIndex) oos.defaultWriteObject() } } @@ -28,7 +28,7 @@ private[spark] case class NarrowCoGroupSplitDep( private[spark] case class ShuffleCoGroupSplitDep(shuffleId: Int) extends CoGroupSplitDep private[spark] -class CoGroupSplit(idx: Int, val deps: Seq[CoGroupSplitDep]) extends Split with Serializable { +class CoGroupPartition(idx: Int, val deps: Seq[CoGroupSplitDep]) extends Partition with Serializable { override val index: Int = idx override def hashCode(): Int = idx } @@ -58,17 +58,17 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(K, _)]], part: Partitioner) } } - override def getSplits: Array[Split] = { - val array = new Array[Split](part.numPartitions) + override def getPartitions: Array[Partition] = { + val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.size) { - // Each CoGroupSplit will have a dependency per contributing RDD - array(i) = new CoGroupSplit(i, rdds.zipWithIndex.map { case (rdd, j) => + // Each CoGroupPartition will have a dependency per contributing RDD + array(i) = new CoGroupPartition(i, rdds.zipWithIndex.map { case (rdd, j) => // Assume each RDD contributed a single dependency, and get it dependencies(j) match { case s: ShuffleDependency[_, _] => new ShuffleCoGroupSplitDep(s.shuffleId) case _ => - new NarrowCoGroupSplitDep(rdd, i, rdd.splits(i)) + new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)) } }.toList) } @@ -77,8 +77,8 @@ class CoGroupedRDD[K](@transient var rdds: Seq[RDD[(K, _)]], part: Partitioner) override val partitioner = Some(part) - override def compute(s: Split, context: TaskContext): Iterator[(K, Seq[Seq[_]])] = { - val split = 
s.asInstanceOf[CoGroupSplit] + override def compute(s: Partition, context: TaskContext): Iterator[(K, Seq[Seq[_]])] = { + val split = s.asInstanceOf[CoGroupPartition] val numRdds = split.deps.size // e.g. for `(k, a) cogroup (k, b)`, K -> Seq(ArrayBuffer as, ArrayBuffer bs) val map = new JHashMap[K, Seq[ArrayBuffer[Any]]] diff --git a/core/src/main/scala/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/spark/rdd/CoalescedRDD.scala index fcd26da43a..0d16cf6e85 100644 --- a/core/src/main/scala/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/spark/rdd/CoalescedRDD.scala @@ -1,19 +1,19 @@ package spark.rdd -import spark.{Dependency, OneToOneDependency, NarrowDependency, RDD, Split, TaskContext} +import spark.{Dependency, OneToOneDependency, NarrowDependency, RDD, Partition, TaskContext} import java.io.{ObjectOutputStream, IOException} -private[spark] case class CoalescedRDDSplit( +private[spark] case class CoalescedRDDPartition( index: Int, @transient rdd: RDD[_], parentsIndices: Array[Int] - ) extends Split { - var parents: Seq[Split] = parentsIndices.map(rdd.splits(_)) + ) extends Partition { + var parents: Seq[Partition] = parentsIndices.map(rdd.partitions(_)) @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream) { // Update the reference to parent split at the time of task serialization - parents = parentsIndices.map(rdd.splits(_)) + parents = parentsIndices.map(rdd.partitions(_)) oos.defaultWriteObject() } } @@ -31,21 +31,21 @@ class CoalescedRDD[T: ClassManifest]( maxPartitions: Int) extends RDD[T](prev.context, Nil) { // Nil since we implement getDependencies - override def getSplits: Array[Split] = { - val prevSplits = prev.splits + override def getPartitions: Array[Partition] = { + val prevSplits = prev.partitions if (prevSplits.length < maxPartitions) { - prevSplits.map(_.index).map{idx => new CoalescedRDDSplit(idx, prev, Array(idx)) } + prevSplits.map(_.index).map{idx => new CoalescedRDDPartition(idx, prev, Array(idx)) } } else { (0 until maxPartitions).map { i => val rangeStart = (i * prevSplits.length) / maxPartitions val rangeEnd = ((i + 1) * prevSplits.length) / maxPartitions - new CoalescedRDDSplit(i, prev, (rangeStart until rangeEnd).toArray) + new CoalescedRDDPartition(i, prev, (rangeStart until rangeEnd).toArray) }.toArray } } - override def compute(split: Split, context: TaskContext): Iterator[T] = { - split.asInstanceOf[CoalescedRDDSplit].parents.iterator.flatMap { parentSplit => + override def compute(split: Partition, context: TaskContext): Iterator[T] = { + split.asInstanceOf[CoalescedRDDPartition].parents.iterator.flatMap { parentSplit => firstParent[T].iterator(parentSplit, context) } } @@ -53,7 +53,7 @@ class CoalescedRDD[T: ClassManifest]( override def getDependencies: Seq[Dependency[_]] = { Seq(new NarrowDependency(prev) { def getParents(id: Int): Seq[Int] = - splits(id).asInstanceOf[CoalescedRDDSplit].parentsIndices + partitions(id).asInstanceOf[CoalescedRDDPartition].parentsIndices }) } diff --git a/core/src/main/scala/spark/rdd/FilteredRDD.scala b/core/src/main/scala/spark/rdd/FilteredRDD.scala index 93e398ea2b..c84ec39d21 100644 --- a/core/src/main/scala/spark/rdd/FilteredRDD.scala +++ b/core/src/main/scala/spark/rdd/FilteredRDD.scala @@ -1,16 +1,16 @@ package spark.rdd -import spark.{OneToOneDependency, RDD, Split, TaskContext} +import spark.{OneToOneDependency, RDD, Partition, TaskContext} private[spark] class FilteredRDD[T: ClassManifest]( prev: RDD[T], f: T => Boolean) extends RDD[T](prev) { - override def getSplits: 
Array[Split] = firstParent[T].splits + override def getPartitions: Array[Partition] = firstParent[T].partitions override val partitioner = prev.partitioner // Since filter cannot change a partition's keys - override def compute(split: Split, context: TaskContext) = + override def compute(split: Partition, context: TaskContext) = firstParent[T].iterator(split, context).filter(f) } diff --git a/core/src/main/scala/spark/rdd/FlatMappedRDD.scala b/core/src/main/scala/spark/rdd/FlatMappedRDD.scala index 8c2a610593..8ebc778925 100644 --- a/core/src/main/scala/spark/rdd/FlatMappedRDD.scala +++ b/core/src/main/scala/spark/rdd/FlatMappedRDD.scala @@ -1,6 +1,6 @@ package spark.rdd -import spark.{RDD, Split, TaskContext} +import spark.{RDD, Partition, TaskContext} private[spark] @@ -9,8 +9,8 @@ class FlatMappedRDD[U: ClassManifest, T: ClassManifest]( f: T => TraversableOnce[U]) extends RDD[U](prev) { - override def getSplits: Array[Split] = firstParent[T].splits + override def getPartitions: Array[Partition] = firstParent[T].partitions - override def compute(split: Split, context: TaskContext) = + override def compute(split: Partition, context: TaskContext) = firstParent[T].iterator(split, context).flatMap(f) } diff --git a/core/src/main/scala/spark/rdd/GlommedRDD.scala b/core/src/main/scala/spark/rdd/GlommedRDD.scala index 70b9b4e34e..e16c7ba881 100644 --- a/core/src/main/scala/spark/rdd/GlommedRDD.scala +++ b/core/src/main/scala/spark/rdd/GlommedRDD.scala @@ -1,12 +1,12 @@ package spark.rdd -import spark.{RDD, Split, TaskContext} +import spark.{RDD, Partition, TaskContext} private[spark] class GlommedRDD[T: ClassManifest](prev: RDD[T]) extends RDD[Array[T]](prev) { - override def getSplits: Array[Split] = firstParent[T].splits + override def getPartitions: Array[Partition] = firstParent[T].partitions - override def compute(split: Split, context: TaskContext) = + override def compute(split: Partition, context: TaskContext) = Array(firstParent[T].iterator(split, context).toArray).iterator } diff --git a/core/src/main/scala/spark/rdd/HadoopRDD.scala b/core/src/main/scala/spark/rdd/HadoopRDD.scala index 854993737b..8139a2a40c 100644 --- a/core/src/main/scala/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/spark/rdd/HadoopRDD.scala @@ -15,14 +15,14 @@ import org.apache.hadoop.mapred.RecordReader import org.apache.hadoop.mapred.Reporter import org.apache.hadoop.util.ReflectionUtils -import spark.{Dependency, RDD, SerializableWritable, SparkContext, Split, TaskContext} +import spark.{Dependency, RDD, SerializableWritable, SparkContext, Partition, TaskContext} /** * A Spark split class that wraps around a Hadoop InputSplit. 
*/ -private[spark] class HadoopSplit(rddId: Int, idx: Int, @transient s: InputSplit) - extends Split { +private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSplit) + extends Partition { val inputSplit = new SerializableWritable[InputSplit](s) @@ -47,12 +47,12 @@ class HadoopRDD[K, V]( // A Hadoop JobConf can be about 10 KB, which is pretty big, so broadcast it private val confBroadcast = sc.broadcast(new SerializableWritable(conf)) - override def getSplits: Array[Split] = { + override def getPartitions: Array[Partition] = { val inputFormat = createInputFormat(conf) val inputSplits = inputFormat.getSplits(conf, minSplits) - val array = new Array[Split](inputSplits.size) + val array = new Array[Partition](inputSplits.size) for (i <- 0 until inputSplits.size) { - array(i) = new HadoopSplit(id, i, inputSplits(i)) + array(i) = new HadoopPartition(id, i, inputSplits(i)) } array } @@ -62,8 +62,8 @@ class HadoopRDD[K, V]( .asInstanceOf[InputFormat[K, V]] } - override def compute(theSplit: Split, context: TaskContext) = new Iterator[(K, V)] { - val split = theSplit.asInstanceOf[HadoopSplit] + override def compute(theSplit: Partition, context: TaskContext) = new Iterator[(K, V)] { + val split = theSplit.asInstanceOf[HadoopPartition] var reader: RecordReader[K, V] = null val conf = confBroadcast.value.value @@ -106,9 +106,9 @@ class HadoopRDD[K, V]( } } - override def getPreferredLocations(split: Split): Seq[String] = { + override def getPreferredLocations(split: Partition): Seq[String] = { // TODO: Filtering out "localhost" in case of file:// URLs - val hadoopSplit = split.asInstanceOf[HadoopSplit] + val hadoopSplit = split.asInstanceOf[HadoopPartition] hadoopSplit.inputSplit.value.getLocations.filter(_ != "localhost") } diff --git a/core/src/main/scala/spark/rdd/MapPartitionsRDD.scala b/core/src/main/scala/spark/rdd/MapPartitionsRDD.scala index 7b0b4525c7..d283c5b2bb 100644 --- a/core/src/main/scala/spark/rdd/MapPartitionsRDD.scala +++ b/core/src/main/scala/spark/rdd/MapPartitionsRDD.scala @@ -1,6 +1,6 @@ package spark.rdd -import spark.{RDD, Split, TaskContext} +import spark.{RDD, Partition, TaskContext} private[spark] @@ -13,8 +13,8 @@ class MapPartitionsRDD[U: ClassManifest, T: ClassManifest]( override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None - override def getSplits: Array[Split] = firstParent[T].splits + override def getPartitions: Array[Partition] = firstParent[T].partitions - override def compute(split: Split, context: TaskContext) = + override def compute(split: Partition, context: TaskContext) = f(firstParent[T].iterator(split, context)) -} \ No newline at end of file +} diff --git a/core/src/main/scala/spark/rdd/MapPartitionsWithIndexRDD.scala b/core/src/main/scala/spark/rdd/MapPartitionsWithIndexRDD.scala new file mode 100644 index 0000000000..afb7504ba1 --- /dev/null +++ b/core/src/main/scala/spark/rdd/MapPartitionsWithIndexRDD.scala @@ -0,0 +1,24 @@ +package spark.rdd + +import spark.{RDD, Partition, TaskContext} + + +/** + * A variant of the MapPartitionsRDD that passes the partition index into the + * closure. This can be used to generate or collect partition specific + * information such as the number of tuples in a partition. 
+ */ +private[spark] +class MapPartitionsWithIndexRDD[U: ClassManifest, T: ClassManifest]( + prev: RDD[T], + f: (Int, Iterator[T]) => Iterator[U], + preservesPartitioning: Boolean + ) extends RDD[U](prev) { + + override def getPartitions: Array[Partition] = firstParent[T].partitions + + override val partitioner = if (preservesPartitioning) prev.partitioner else None + + override def compute(split: Partition, context: TaskContext) = + f(split.index, firstParent[T].iterator(split, context)) +} diff --git a/core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala b/core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala deleted file mode 100644 index c6dc1080a9..0000000000 --- a/core/src/main/scala/spark/rdd/MapPartitionsWithSplitRDD.scala +++ /dev/null @@ -1,24 +0,0 @@ -package spark.rdd - -import spark.{RDD, Split, TaskContext} - - -/** - * A variant of the MapPartitionsRDD that passes the split index into the - * closure. This can be used to generate or collect partition specific - * information such as the number of tuples in a partition. - */ -private[spark] -class MapPartitionsWithSplitRDD[U: ClassManifest, T: ClassManifest]( - prev: RDD[T], - f: (Int, Iterator[T]) => Iterator[U], - preservesPartitioning: Boolean - ) extends RDD[U](prev) { - - override def getSplits: Array[Split] = firstParent[T].splits - - override val partitioner = if (preservesPartitioning) prev.partitioner else None - - override def compute(split: Split, context: TaskContext) = - f(split.index, firstParent[T].iterator(split, context)) -} \ No newline at end of file diff --git a/core/src/main/scala/spark/rdd/MappedRDD.scala b/core/src/main/scala/spark/rdd/MappedRDD.scala index 6074f411e3..af07311b6d 100644 --- a/core/src/main/scala/spark/rdd/MappedRDD.scala +++ b/core/src/main/scala/spark/rdd/MappedRDD.scala @@ -1,13 +1,13 @@ package spark.rdd -import spark.{RDD, Split, TaskContext} +import spark.{RDD, Partition, TaskContext} private[spark] class MappedRDD[U: ClassManifest, T: ClassManifest](prev: RDD[T], f: T => U) extends RDD[U](prev) { - override def getSplits: Array[Split] = firstParent[T].splits + override def getPartitions: Array[Partition] = firstParent[T].partitions - override def compute(split: Split, context: TaskContext) = + override def compute(split: Partition, context: TaskContext) = firstParent[T].iterator(split, context).map(f) } diff --git a/core/src/main/scala/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/spark/rdd/NewHadoopRDD.scala index 345ae79d74..ebd4c3f0e2 100644 --- a/core/src/main/scala/spark/rdd/NewHadoopRDD.scala +++ b/core/src/main/scala/spark/rdd/NewHadoopRDD.scala @@ -7,12 +7,12 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ -import spark.{Dependency, RDD, SerializableWritable, SparkContext, Split, TaskContext} +import spark.{Dependency, RDD, SerializableWritable, SparkContext, Partition, TaskContext} private[spark] -class NewHadoopSplit(rddId: Int, val index: Int, @transient rawSplit: InputSplit with Writable) - extends Split { +class NewHadoopPartition(rddId: Int, val index: Int, @transient rawSplit: InputSplit with Writable) + extends Partition { val serializableHadoopSplit = new SerializableWritable(rawSplit) @@ -39,19 +39,19 @@ class NewHadoopRDD[K, V]( @transient private val jobId = new JobID(jobtrackerId, id) - override def getSplits: Array[Split] = { + override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance val jobContext = newJobContext(conf, jobId) 
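The MapPartitionsWithIndexRDD added earlier in this patch is surfaced to user code as RDD.mapPartitionsWithIndex, which the RDDSuite change later in the series exercises. A minimal usage sketch, assuming an existing SparkContext named sc; the variable names are illustrative:

    // Count how many records fall into each partition, using the partition
    // index that the new operator passes into the closure.
    val nums = sc.parallelize(1 to 4, 2)
    val countsPerPartition = nums.mapPartitionsWithIndex {
      case (index, iter) => Iterator((index, iter.size))
    }
    countsPerPartition.collect()   // Array((0, 2), (1, 2)) for this input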
val rawSplits = inputFormat.getSplits(jobContext).toArray - val result = new Array[Split](rawSplits.size) + val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { - result(i) = new NewHadoopSplit(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) + result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } - override def compute(theSplit: Split, context: TaskContext) = new Iterator[(K, V)] { - val split = theSplit.asInstanceOf[NewHadoopSplit] + override def compute(theSplit: Partition, context: TaskContext) = new Iterator[(K, V)] { + val split = theSplit.asInstanceOf[NewHadoopPartition] val conf = confBroadcast.value.value val attemptId = new TaskAttemptID(jobtrackerId, id, true, split.index, 0) val hadoopAttemptContext = newTaskAttemptContext(conf, attemptId) @@ -83,8 +83,8 @@ class NewHadoopRDD[K, V]( } } - override def getPreferredLocations(split: Split): Seq[String] = { - val theSplit = split.asInstanceOf[NewHadoopSplit] + override def getPreferredLocations(split: Partition): Seq[String] = { + val theSplit = split.asInstanceOf[NewHadoopPartition] theSplit.serializableHadoopSplit.value.getLocations.filter(_ != "localhost") } } diff --git a/core/src/main/scala/spark/rdd/ParallelCollectionRDD.scala b/core/src/main/scala/spark/rdd/ParallelCollectionRDD.scala index e703794787..07585a88ce 100644 --- a/core/src/main/scala/spark/rdd/ParallelCollectionRDD.scala +++ b/core/src/main/scala/spark/rdd/ParallelCollectionRDD.scala @@ -3,20 +3,20 @@ package spark.rdd import scala.collection.immutable.NumericRange import scala.collection.mutable.ArrayBuffer import scala.collection.Map -import spark.{RDD, TaskContext, SparkContext, Split} +import spark.{RDD, TaskContext, SparkContext, Partition} -private[spark] class ParallelCollectionSplit[T: ClassManifest]( +private[spark] class ParallelCollectionPartition[T: ClassManifest]( val rddId: Long, val slice: Int, values: Seq[T]) - extends Split with Serializable { + extends Partition with Serializable { def iterator: Iterator[T] = values.iterator override def hashCode(): Int = (41 * (41 + rddId) + slice).toInt override def equals(other: Any): Boolean = other match { - case that: ParallelCollectionSplit[_] => (this.rddId == that.rddId && this.slice == that.slice) + case that: ParallelCollectionPartition[_] => (this.rddId == that.rddId && this.slice == that.slice) case _ => false } @@ -34,15 +34,15 @@ private[spark] class ParallelCollectionRDD[T: ClassManifest]( // instead. // UPDATE: A parallel collection can be checkpointed to HDFS, which achieves this goal. 
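The same mechanical pattern — getSplits becoming getPartitions and Split becoming Partition — applies to any custom RDD. A minimal sketch of a subclass written against the renamed API, modeled on the simple wrapper RDDs above (the class name is illustrative):

    package spark.rdd

    import spark.{Partition, RDD, TaskContext}

    // A trivial pass-through RDD: partitions come from getPartitions and
    // compute receives a Partition rather than a Split.
    private[spark] class IdentityRDD[T: ClassManifest](prev: RDD[T])
      extends RDD[T](prev) {

      override def getPartitions: Array[Partition] = firstParent[T].partitions

      override def compute(split: Partition, context: TaskContext): Iterator[T] =
        firstParent[T].iterator(split, context)
    }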
- override def getSplits: Array[Split] = { + override def getPartitions: Array[Partition] = { val slices = ParallelCollectionRDD.slice(data, numSlices).toArray - slices.indices.map(i => new ParallelCollectionSplit(id, i, slices(i))).toArray + slices.indices.map(i => new ParallelCollectionPartition(id, i, slices(i))).toArray } - override def compute(s: Split, context: TaskContext) = - s.asInstanceOf[ParallelCollectionSplit[T]].iterator + override def compute(s: Partition, context: TaskContext) = + s.asInstanceOf[ParallelCollectionPartition[T]].iterator - override def getPreferredLocations(s: Split): Seq[String] = { + override def getPreferredLocations(s: Partition): Seq[String] = { locationPrefs.getOrElse(s.index, Nil) } } diff --git a/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala b/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala index d1553181c1..f2f4fd56d1 100644 --- a/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala +++ b/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala @@ -1,9 +1,9 @@ package spark.rdd -import spark.{NarrowDependency, RDD, SparkEnv, Split, TaskContext} +import spark.{NarrowDependency, RDD, SparkEnv, Partition, TaskContext} -class PartitionPruningRDDSplit(idx: Int, val parentSplit: Split) extends Split { +class PartitionPruningRDDPartition(idx: Int, val parentSplit: Partition) extends Partition { override val index = idx } @@ -16,15 +16,15 @@ class PruneDependency[T](rdd: RDD[T], @transient partitionFilterFunc: Int => Boo extends NarrowDependency[T](rdd) { @transient - val partitions: Array[Split] = rdd.splits.filter(s => partitionFilterFunc(s.index)) - .zipWithIndex.map { case(split, idx) => new PartitionPruningRDDSplit(idx, split) : Split } + val partitions: Array[Partition] = rdd.partitions.filter(s => partitionFilterFunc(s.index)) + .zipWithIndex.map { case(split, idx) => new PartitionPruningRDDPartition(idx, split) : Partition } override def getParents(partitionId: Int) = List(partitions(partitionId).index) } /** - * A RDD used to prune RDD partitions/splits so we can avoid launching tasks on + * A RDD used to prune RDD partitions/partitions so we can avoid launching tasks on * all partitions. An example use case: If we know the RDD is partitioned by range, * and the execution DAG has a filter on the key, we can avoid launching tasks * on partitions that don't have the range covering the key. 
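A small usage sketch of the pruning described in this comment, following the RDDSuite test further down in the patch (ten single-element partitions, of which only the partition with index 9 survives the predicate):

    import spark.SparkContext
    import spark.rdd.PartitionPruningRDD

    val sc = new SparkContext("local", "pruning-example")   // app name is illustrative
    val data = sc.parallelize(1 to 10, 10)
    // Keep only partitions whose index is greater than 8, i.e. the 10th partition.
    val pruned = new PartitionPruningRDD(data, splitNum => splitNum > 8)
    pruned.partitions.size   // 1
    pruned.collect()         // Array(10); only one task is launched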
@@ -34,9 +34,9 @@ class PartitionPruningRDD[T: ClassManifest]( @transient partitionFilterFunc: Int => Boolean) extends RDD[T](prev.context, List(new PruneDependency(prev, partitionFilterFunc))) { - override def compute(split: Split, context: TaskContext) = firstParent[T].iterator( - split.asInstanceOf[PartitionPruningRDDSplit].parentSplit, context) + override def compute(split: Partition, context: TaskContext) = firstParent[T].iterator( + split.asInstanceOf[PartitionPruningRDDPartition].parentSplit, context) - override protected def getSplits: Array[Split] = + override protected def getPartitions: Array[Partition] = getDependencies.head.asInstanceOf[PruneDependency[T]].partitions } diff --git a/core/src/main/scala/spark/rdd/PipedRDD.scala b/core/src/main/scala/spark/rdd/PipedRDD.scala index 56032a8659..962a1b21ad 100644 --- a/core/src/main/scala/spark/rdd/PipedRDD.scala +++ b/core/src/main/scala/spark/rdd/PipedRDD.scala @@ -8,7 +8,7 @@ import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.io.Source -import spark.{RDD, SparkEnv, Split, TaskContext} +import spark.{RDD, SparkEnv, Partition, TaskContext} /** @@ -27,9 +27,9 @@ class PipedRDD[T: ClassManifest]( // using a standard StringTokenizer (i.e. by spaces) def this(prev: RDD[T], command: String) = this(prev, PipedRDD.tokenize(command)) - override def getSplits: Array[Split] = firstParent[T].splits + override def getPartitions: Array[Partition] = firstParent[T].partitions - override def compute(split: Split, context: TaskContext): Iterator[String] = { + override def compute(split: Partition, context: TaskContext): Iterator[String] = { val pb = new ProcessBuilder(command) // Add the environmental variables to the process. val currentEnvVars = pb.environment() diff --git a/core/src/main/scala/spark/rdd/SampledRDD.scala b/core/src/main/scala/spark/rdd/SampledRDD.scala index f2a144e2e0..243673f151 100644 --- a/core/src/main/scala/spark/rdd/SampledRDD.scala +++ b/core/src/main/scala/spark/rdd/SampledRDD.scala @@ -5,10 +5,10 @@ import java.util.Random import cern.jet.random.Poisson import cern.jet.random.engine.DRand -import spark.{RDD, Split, TaskContext} +import spark.{RDD, Partition, TaskContext} private[spark] -class SampledRDDSplit(val prev: Split, val seed: Int) extends Split with Serializable { +class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable { override val index: Int = prev.index } @@ -19,16 +19,16 @@ class SampledRDD[T: ClassManifest]( seed: Int) extends RDD[T](prev) { - override def getSplits: Array[Split] = { + override def getPartitions: Array[Partition] = { val rg = new Random(seed) - firstParent[T].splits.map(x => new SampledRDDSplit(x, rg.nextInt)) + firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt)) } - override def getPreferredLocations(split: Split): Seq[String] = - firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDSplit].prev) + override def getPreferredLocations(split: Partition): Seq[String] = + firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev) - override def compute(splitIn: Split, context: TaskContext): Iterator[T] = { - val split = splitIn.asInstanceOf[SampledRDDSplit] + override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = { + val split = splitIn.asInstanceOf[SampledRDDPartition] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with // replacement is Poisson(frac). 
We use that to get a count for each element. diff --git a/core/src/main/scala/spark/rdd/ShuffledRDD.scala b/core/src/main/scala/spark/rdd/ShuffledRDD.scala index bf69b5150b..c2f118305f 100644 --- a/core/src/main/scala/spark/rdd/ShuffledRDD.scala +++ b/core/src/main/scala/spark/rdd/ShuffledRDD.scala @@ -1,9 +1,9 @@ package spark.rdd -import spark.{Partitioner, RDD, SparkEnv, ShuffleDependency, Split, TaskContext} +import spark.{Partitioner, RDD, SparkEnv, ShuffleDependency, Partition, TaskContext} import spark.SparkContext._ -private[spark] class ShuffledRDDSplit(val idx: Int) extends Split { +private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index = idx override def hashCode(): Int = idx } @@ -22,11 +22,11 @@ class ShuffledRDD[K, V]( override val partitioner = Some(part) - override def getSplits: Array[Split] = { - Array.tabulate[Split](part.numPartitions)(i => new ShuffledRDDSplit(i)) + override def getPartitions: Array[Partition] = { + Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } - override def compute(split: Split, context: TaskContext): Iterator[(K, V)] = { + override def compute(split: Partition, context: TaskContext): Iterator[(K, V)] = { val shuffledId = dependencies.head.asInstanceOf[ShuffleDependency[K, V]].shuffleId SparkEnv.get.shuffleFetcher.fetch[K, V](shuffledId, split.index) } diff --git a/core/src/main/scala/spark/rdd/UnionRDD.scala b/core/src/main/scala/spark/rdd/UnionRDD.scala index ebc0068228..2c52a67e22 100644 --- a/core/src/main/scala/spark/rdd/UnionRDD.scala +++ b/core/src/main/scala/spark/rdd/UnionRDD.scala @@ -1,13 +1,13 @@ package spark.rdd import scala.collection.mutable.ArrayBuffer -import spark.{Dependency, RangeDependency, RDD, SparkContext, Split, TaskContext} +import spark.{Dependency, RangeDependency, RDD, SparkContext, Partition, TaskContext} import java.io.{ObjectOutputStream, IOException} -private[spark] class UnionSplit[T: ClassManifest](idx: Int, rdd: RDD[T], splitIndex: Int) - extends Split { +private[spark] class UnionPartition[T: ClassManifest](idx: Int, rdd: RDD[T], splitIndex: Int) + extends Partition { - var split: Split = rdd.splits(splitIndex) + var split: Partition = rdd.partitions(splitIndex) def iterator(context: TaskContext) = rdd.iterator(split, context) @@ -18,7 +18,7 @@ private[spark] class UnionSplit[T: ClassManifest](idx: Int, rdd: RDD[T], splitIn @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream) { // Update the reference to parent split at the time of task serialization - split = rdd.splits(splitIndex) + split = rdd.partitions(splitIndex) oos.defaultWriteObject() } } @@ -28,11 +28,11 @@ class UnionRDD[T: ClassManifest]( @transient var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies - override def getSplits: Array[Split] = { - val array = new Array[Split](rdds.map(_.splits.size).sum) + override def getPartitions: Array[Partition] = { + val array = new Array[Partition](rdds.map(_.partitions.size).sum) var pos = 0 - for (rdd <- rdds; split <- rdd.splits) { - array(pos) = new UnionSplit(pos, rdd, split.index) + for (rdd <- rdds; split <- rdd.partitions) { + array(pos) = new UnionPartition(pos, rdd, split.index) pos += 1 } array @@ -42,15 +42,15 @@ class UnionRDD[T: ClassManifest]( val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { - deps += new RangeDependency(rdd, 0, pos, rdd.splits.size) - pos += rdd.splits.size + deps += new RangeDependency(rdd, 0, pos, 
rdd.partitions.size) + pos += rdd.partitions.size } deps } - override def compute(s: Split, context: TaskContext): Iterator[T] = - s.asInstanceOf[UnionSplit[T]].iterator(context) + override def compute(s: Partition, context: TaskContext): Iterator[T] = + s.asInstanceOf[UnionPartition[T]].iterator(context) - override def getPreferredLocations(s: Split): Seq[String] = - s.asInstanceOf[UnionSplit[T]].preferredLocations() + override def getPreferredLocations(s: Partition): Seq[String] = + s.asInstanceOf[UnionPartition[T]].preferredLocations() } diff --git a/core/src/main/scala/spark/rdd/ZippedRDD.scala b/core/src/main/scala/spark/rdd/ZippedRDD.scala index 1ce70268bb..e80ec17aa5 100644 --- a/core/src/main/scala/spark/rdd/ZippedRDD.scala +++ b/core/src/main/scala/spark/rdd/ZippedRDD.scala @@ -1,17 +1,17 @@ package spark.rdd -import spark.{OneToOneDependency, RDD, SparkContext, Split, TaskContext} +import spark.{OneToOneDependency, RDD, SparkContext, Partition, TaskContext} import java.io.{ObjectOutputStream, IOException} -private[spark] class ZippedSplit[T: ClassManifest, U: ClassManifest]( +private[spark] class ZippedPartition[T: ClassManifest, U: ClassManifest]( idx: Int, @transient rdd1: RDD[T], @transient rdd2: RDD[U] - ) extends Split { + ) extends Partition { - var split1 = rdd1.splits(idx) - var split2 = rdd1.splits(idx) + var split1 = rdd1.partitions(idx) + var split2 = rdd1.partitions(idx) override val index: Int = idx def splits = (split1, split2) @@ -19,8 +19,8 @@ private[spark] class ZippedSplit[T: ClassManifest, U: ClassManifest]( @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream) { // Update the reference to parent split at the time of task serialization - split1 = rdd1.splits(idx) - split2 = rdd2.splits(idx) + split1 = rdd1.partitions(idx) + split2 = rdd2.partitions(idx) oos.defaultWriteObject() } } @@ -31,24 +31,24 @@ class ZippedRDD[T: ClassManifest, U: ClassManifest]( var rdd2: RDD[U]) extends RDD[(T, U)](sc, List(new OneToOneDependency(rdd1), new OneToOneDependency(rdd2))) { - override def getSplits: Array[Split] = { - if (rdd1.splits.size != rdd2.splits.size) { + override def getPartitions: Array[Partition] = { + if (rdd1.partitions.size != rdd2.partitions.size) { throw new IllegalArgumentException("Can't zip RDDs with unequal numbers of partitions") } - val array = new Array[Split](rdd1.splits.size) - for (i <- 0 until rdd1.splits.size) { - array(i) = new ZippedSplit(i, rdd1, rdd2) + val array = new Array[Partition](rdd1.partitions.size) + for (i <- 0 until rdd1.partitions.size) { + array(i) = new ZippedPartition(i, rdd1, rdd2) } array } - override def compute(s: Split, context: TaskContext): Iterator[(T, U)] = { - val (split1, split2) = s.asInstanceOf[ZippedSplit[T, U]].splits + override def compute(s: Partition, context: TaskContext): Iterator[(T, U)] = { + val (split1, split2) = s.asInstanceOf[ZippedPartition[T, U]].splits rdd1.iterator(split1, context).zip(rdd2.iterator(split2, context)) } - override def getPreferredLocations(s: Split): Seq[String] = { - val (split1, split2) = s.asInstanceOf[ZippedSplit[T, U]].splits + override def getPreferredLocations(s: Partition): Seq[String] = { + val (split1, split2) = s.asInstanceOf[ZippedPartition[T, U]].splits rdd1.preferredLocations(split1).intersect(rdd2.preferredLocations(split2)) } diff --git a/core/src/main/scala/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/spark/scheduler/DAGScheduler.scala index 319eef6978..bf0837c066 100644 --- 
a/core/src/main/scala/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/spark/scheduler/DAGScheduler.scala @@ -106,7 +106,7 @@ class DAGScheduler( private def getCacheLocs(rdd: RDD[_]): Array[List[String]] = { if (!cacheLocs.contains(rdd.id)) { - val blockIds = rdd.splits.indices.map(index=> "rdd_%d_%d".format(rdd.id, index)).toArray + val blockIds = rdd.partitions.indices.map(index=> "rdd_%d_%d".format(rdd.id, index)).toArray cacheLocs(rdd.id) = blockManagerMaster.getLocations(blockIds).map { locations => locations.map(_.ip).toList }.toArray @@ -141,9 +141,9 @@ class DAGScheduler( private def newStage(rdd: RDD[_], shuffleDep: Option[ShuffleDependency[_,_]], priority: Int): Stage = { if (shuffleDep != None) { // Kind of ugly: need to register RDDs with the cache and map output tracker here - // since we can't do it in the RDD constructor because # of splits is unknown + // since we can't do it in the RDD constructor because # of partitions is unknown logInfo("Registering RDD " + rdd.id + " (" + rdd.origin + ")") - mapOutputTracker.registerShuffle(shuffleDep.get.shuffleId, rdd.splits.size) + mapOutputTracker.registerShuffle(shuffleDep.get.shuffleId, rdd.partitions.size) } val id = nextStageId.getAndIncrement() val stage = new Stage(id, rdd, shuffleDep, getParentStages(rdd, priority), priority) @@ -162,7 +162,7 @@ class DAGScheduler( if (!visited(r)) { visited += r // Kind of ugly: need to register RDDs with the cache here since - // we can't do it in its constructor because # of splits is unknown + // we can't do it in its constructor because # of partitions is unknown for (dep <- r.dependencies) { dep match { case shufDep: ShuffleDependency[_,_] => @@ -257,7 +257,7 @@ class DAGScheduler( { val listener = new ApproximateActionListener(rdd, func, evaluator, timeout) val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _] - val partitions = (0 until rdd.splits.size).toArray + val partitions = (0 until rdd.partitions.size).toArray eventQueue.put(JobSubmitted(rdd, func2, partitions, false, callSite, listener)) return listener.awaitResult() // Will throw an exception if the job fails } @@ -386,7 +386,7 @@ class DAGScheduler( try { SparkEnv.set(env) val rdd = job.finalStage.rdd - val split = rdd.splits(job.partitions(0)) + val split = rdd.partitions(job.partitions(0)) val taskContext = new TaskContext(job.finalStage.id, job.partitions(0), 0) try { val result = job.func(taskContext, rdd.iterator(split, taskContext)) @@ -672,7 +672,7 @@ class DAGScheduler( return cached } // If the RDD has some placement preferences (as is the case for input RDDs), get those - val rddPrefs = rdd.preferredLocations(rdd.splits(partition)).toList + val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList if (rddPrefs != Nil) { return rddPrefs } diff --git a/core/src/main/scala/spark/scheduler/ResultTask.scala b/core/src/main/scala/spark/scheduler/ResultTask.scala index 8cd4c661eb..1721f78f48 100644 --- a/core/src/main/scala/spark/scheduler/ResultTask.scala +++ b/core/src/main/scala/spark/scheduler/ResultTask.scala @@ -67,7 +67,7 @@ private[spark] class ResultTask[T, U]( var split = if (rdd == null) { null } else { - rdd.splits(partition) + rdd.partitions(partition) } override def run(attemptId: Long): U = { @@ -85,7 +85,7 @@ private[spark] class ResultTask[T, U]( override def writeExternal(out: ObjectOutput) { RDDCheckpointData.synchronized { - split = rdd.splits(partition) + split = rdd.partitions(partition) out.writeInt(stageId) val bytes = ResultTask.serializeInfo( stageId, rdd, 
func.asInstanceOf[(TaskContext, Iterator[_]) => _]) @@ -107,6 +107,6 @@ private[spark] class ResultTask[T, U]( func = func_.asInstanceOf[(TaskContext, Iterator[T]) => U] partition = in.readInt() val outputId = in.readInt() - split = in.readObject().asInstanceOf[Split] + split = in.readObject().asInstanceOf[Partition] } } diff --git a/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala index bed9f1864f..59ee3c0a09 100644 --- a/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala +++ b/core/src/main/scala/spark/scheduler/ShuffleMapTask.scala @@ -86,12 +86,12 @@ private[spark] class ShuffleMapTask( var split = if (rdd == null) { null } else { - rdd.splits(partition) + rdd.partitions(partition) } override def writeExternal(out: ObjectOutput) { RDDCheckpointData.synchronized { - split = rdd.splits(partition) + split = rdd.partitions(partition) out.writeInt(stageId) val bytes = ShuffleMapTask.serializeInfo(stageId, rdd, dep) out.writeInt(bytes.length) @@ -112,7 +112,7 @@ private[spark] class ShuffleMapTask( dep = dep_ partition = in.readInt() generation = in.readLong() - split = in.readObject().asInstanceOf[Split] + split = in.readObject().asInstanceOf[Partition] } override def run(attemptId: Long): MapStatus = { diff --git a/core/src/main/scala/spark/scheduler/Stage.scala b/core/src/main/scala/spark/scheduler/Stage.scala index 374114d870..552061e46b 100644 --- a/core/src/main/scala/spark/scheduler/Stage.scala +++ b/core/src/main/scala/spark/scheduler/Stage.scala @@ -28,7 +28,7 @@ private[spark] class Stage( extends Logging { val isShuffleMap = shuffleDep != None - val numPartitions = rdd.splits.size + val numPartitions = rdd.partitions.size val outputLocs = Array.fill[List[MapStatus]](numPartitions)(Nil) var numAvailableOutputs = 0 diff --git a/core/src/main/scala/spark/storage/BlockManager.scala b/core/src/main/scala/spark/storage/BlockManager.scala index 2e7db60841..2462721fb8 100644 --- a/core/src/main/scala/spark/storage/BlockManager.scala +++ b/core/src/main/scala/spark/storage/BlockManager.scala @@ -513,7 +513,7 @@ class BlockManager( } } - // Split local and remote blocks. Remote blocks are further split into FetchRequests of size + // Partition local and remote blocks. Remote blocks are further split into FetchRequests of size // at most maxBytesInFlight in order to limit the amount of data in flight. 
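The comment above describes how remote blocks are packed into FetchRequests whose total size is bounded by maxBytesInFlight. A stand-alone sketch of that packing idea, not the BlockManager implementation itself; the helper name and types are illustrative:

    import scala.collection.mutable.ArrayBuffer

    // Group (blockId, size) pairs into batches whose combined size stays within
    // a byte budget, in the spirit of the FetchRequest splitting described above.
    def packIntoRequests(blocks: Seq[(String, Long)], maxBytes: Long): Seq[Seq[(String, Long)]] = {
      val requests = new ArrayBuffer[Seq[(String, Long)]]
      var current = new ArrayBuffer[(String, Long)]
      var currentSize = 0L
      for ((blockId, size) <- blocks) {
        // Close out the current batch once adding this block would exceed the budget.
        if (current.nonEmpty && currentSize + size > maxBytes) {
          requests += current.toSeq
          current = new ArrayBuffer[(String, Long)]
          currentSize = 0L
        }
        current += ((blockId, size))
        currentSize += size
      }
      if (current.nonEmpty) requests += current.toSeq
      requests.toSeq
    }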
val remoteRequests = new ArrayBuffer[FetchRequest] for ((address, blockInfos) <- blocksByAddress) { diff --git a/core/src/main/scala/spark/storage/StorageUtils.scala b/core/src/main/scala/spark/storage/StorageUtils.scala index 5f72b67b2c..dec47a9d41 100644 --- a/core/src/main/scala/spark/storage/StorageUtils.scala +++ b/core/src/main/scala/spark/storage/StorageUtils.scala @@ -63,7 +63,7 @@ object StorageUtils { val rddName = Option(rdd.name).getOrElse(rddKey) val rddStorageLevel = rdd.getStorageLevel - RDDInfo(rddId, rddName, rddStorageLevel, rddBlocks.length, rdd.splits.size, memSize, diskSize) + RDDInfo(rddId, rddName, rddStorageLevel, rddBlocks.length, rdd.partitions.size, memSize, diskSize) }.toArray } diff --git a/core/src/test/scala/spark/CheckpointSuite.scala b/core/src/test/scala/spark/CheckpointSuite.scala index 51ff966ae4..3e5ffa81d6 100644 --- a/core/src/test/scala/spark/CheckpointSuite.scala +++ b/core/src/test/scala/spark/CheckpointSuite.scala @@ -34,7 +34,7 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { testCheckpointing(_.sample(false, 0.5, 0)) testCheckpointing(_.glom()) testCheckpointing(_.mapPartitions(_.map(_.toString))) - testCheckpointing(r => new MapPartitionsWithSplitRDD(r, + testCheckpointing(r => new MapPartitionsWithIndexRDD(r, (i: Int, iter: Iterator[Int]) => iter.map(_.toString), false )) testCheckpointing(_.map(x => (x % 2, 1)).reduceByKey(_ + _).mapValues(_.toString)) testCheckpointing(_.map(x => (x % 2, 1)).reduceByKey(_ + _).flatMapValues(x => 1 to x)) @@ -43,14 +43,14 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { test("ParallelCollection") { val parCollection = sc.makeRDD(1 to 4, 2) - val numSplits = parCollection.splits.size + val numPartitions = parCollection.partitions.size parCollection.checkpoint() assert(parCollection.dependencies === Nil) val result = parCollection.collect() assert(sc.checkpointFile[Int](parCollection.getCheckpointFile.get).collect() === result) assert(parCollection.dependencies != Nil) - assert(parCollection.splits.length === numSplits) - assert(parCollection.splits.toList === parCollection.checkpointData.get.getSplits.toList) + assert(parCollection.partitions.length === numPartitions) + assert(parCollection.partitions.toList === parCollection.checkpointData.get.getPartitions.toList) assert(parCollection.collect() === result) } @@ -59,13 +59,13 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { val blockManager = SparkEnv.get.blockManager blockManager.putSingle(blockId, "test", StorageLevel.MEMORY_ONLY) val blockRDD = new BlockRDD[String](sc, Array(blockId)) - val numSplits = blockRDD.splits.size + val numPartitions = blockRDD.partitions.size blockRDD.checkpoint() val result = blockRDD.collect() assert(sc.checkpointFile[String](blockRDD.getCheckpointFile.get).collect() === result) assert(blockRDD.dependencies != Nil) - assert(blockRDD.splits.length === numSplits) - assert(blockRDD.splits.toList === blockRDD.checkpointData.get.getSplits.toList) + assert(blockRDD.partitions.length === numPartitions) + assert(blockRDD.partitions.toList === blockRDD.checkpointData.get.getPartitions.toList) assert(blockRDD.collect() === result) } @@ -79,9 +79,9 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { test("UnionRDD") { def otherRDD = sc.makeRDD(1 to 10, 1) - // Test whether the size of UnionRDDSplits reduce in size after parent RDD is checkpointed. 
+ // Test whether the size of UnionRDDPartitions reduce in size after parent RDD is checkpointed. // Current implementation of UnionRDD has transient reference to parent RDDs, - // so only the splits will reduce in serialized size, not the RDD. + // so only the partitions will reduce in serialized size, not the RDD. testCheckpointing(_.union(otherRDD), false, true) testParentCheckpointing(_.union(otherRDD), false, true) } @@ -91,21 +91,21 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { testCheckpointing(new CartesianRDD(sc, _, otherRDD)) // Test whether size of CoalescedRDD reduce in size after parent RDD is checkpointed - // Current implementation of CoalescedRDDSplit has transient reference to parent RDD, - // so only the RDD will reduce in serialized size, not the splits. + // Current implementation of CoalescedRDDPartition has transient reference to parent RDD, + // so only the RDD will reduce in serialized size, not the partitions. testParentCheckpointing(new CartesianRDD(sc, _, otherRDD), true, false) - // Test that the CartesianRDD updates parent splits (CartesianRDD.s1/s2) after - // the parent RDD has been checkpointed and parent splits have been changed to HadoopSplits. + // Test that the CartesianRDD updates parent partitions (CartesianRDD.s1/s2) after + // the parent RDD has been checkpointed and parent partitions have been changed to HadoopPartitions. // Note that this test is very specific to the current implementation of CartesianRDD. val ones = sc.makeRDD(1 to 100, 10).map(x => x) ones.checkpoint() // checkpoint that MappedRDD val cartesian = new CartesianRDD(sc, ones, ones) val splitBeforeCheckpoint = - serializeDeserialize(cartesian.splits.head.asInstanceOf[CartesianSplit]) + serializeDeserialize(cartesian.partitions.head.asInstanceOf[CartesianPartition]) cartesian.count() // do the checkpointing val splitAfterCheckpoint = - serializeDeserialize(cartesian.splits.head.asInstanceOf[CartesianSplit]) + serializeDeserialize(cartesian.partitions.head.asInstanceOf[CartesianPartition]) assert( (splitAfterCheckpoint.s1 != splitBeforeCheckpoint.s1) && (splitAfterCheckpoint.s2 != splitBeforeCheckpoint.s2), @@ -117,24 +117,24 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { testCheckpointing(_.coalesce(2)) // Test whether size of CoalescedRDD reduce in size after parent RDD is checkpointed - // Current implementation of CoalescedRDDSplit has transient reference to parent RDD, - // so only the RDD will reduce in serialized size, not the splits. + // Current implementation of CoalescedRDDPartition has transient reference to parent RDD, + // so only the RDD will reduce in serialized size, not the partitions. testParentCheckpointing(_.coalesce(2), true, false) - // Test that the CoalescedRDDSplit updates parent splits (CoalescedRDDSplit.parents) after - // the parent RDD has been checkpointed and parent splits have been changed to HadoopSplits. - // Note that this test is very specific to the current implementation of CoalescedRDDSplits + // Test that the CoalescedRDDPartition updates parent partitions (CoalescedRDDPartition.parents) after + // the parent RDD has been checkpointed and parent partitions have been changed to HadoopPartitions. 
+ // Note that this test is very specific to the current implementation of CoalescedRDDPartitions val ones = sc.makeRDD(1 to 100, 10).map(x => x) ones.checkpoint() // checkpoint that MappedRDD val coalesced = new CoalescedRDD(ones, 2) val splitBeforeCheckpoint = - serializeDeserialize(coalesced.splits.head.asInstanceOf[CoalescedRDDSplit]) + serializeDeserialize(coalesced.partitions.head.asInstanceOf[CoalescedRDDPartition]) coalesced.count() // do the checkpointing val splitAfterCheckpoint = - serializeDeserialize(coalesced.splits.head.asInstanceOf[CoalescedRDDSplit]) + serializeDeserialize(coalesced.partitions.head.asInstanceOf[CoalescedRDDPartition]) assert( splitAfterCheckpoint.parents.head != splitBeforeCheckpoint.parents.head, - "CoalescedRDDSplit.parents not updated after parent RDD checkpointed" + "CoalescedRDDPartition.parents not updated after parent RDD checkpointed" ) } @@ -156,8 +156,8 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { rdd => new ZippedRDD(sc, rdd, rdd.map(x => x)), true, false) // Test whether size of ZippedRDD reduce in size after parent RDD is checkpointed - // Current implementation of ZippedRDDSplit has transient references to parent RDDs, - // so only the RDD will reduce in serialized size, not the splits. + // Current implementation of ZippedRDDPartitions has transient references to parent RDDs, + // so only the RDD will reduce in serialized size, not the partitions. testParentCheckpointing( rdd => new ZippedRDD(sc, rdd, rdd.map(x => x)), true, false) } @@ -165,21 +165,21 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { /** * Test checkpointing of the final RDD generated by the given operation. By default, * this method tests whether the size of serialized RDD has reduced after checkpointing or not. - * It can also test whether the size of serialized RDD splits has reduced after checkpointing or - * not, but this is not done by default as usually the splits do not refer to any RDD and + * It can also test whether the size of serialized RDD partitions has reduced after checkpointing or + * not, but this is not done by default as usually the partitions do not refer to any RDD and * therefore never store the lineage. 
*/ def testCheckpointing[U: ClassManifest]( op: (RDD[Int]) => RDD[U], testRDDSize: Boolean = true, - testRDDSplitSize: Boolean = false + testRDDPartitionSize: Boolean = false ) { // Generate the final RDD using given RDD operation val baseRDD = generateLongLineageRDD() val operatedRDD = op(baseRDD) val parentRDD = operatedRDD.dependencies.headOption.orNull val rddType = operatedRDD.getClass.getSimpleName - val numSplits = operatedRDD.splits.length + val numPartitions = operatedRDD.partitions.length // Find serialized sizes before and after the checkpoint val (rddSizeBeforeCheckpoint, splitSizeBeforeCheckpoint) = getSerializedSizes(operatedRDD) @@ -193,11 +193,11 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { // Test whether dependencies have been changed from its earlier parent RDD assert(operatedRDD.dependencies.head.rdd != parentRDD) - // Test whether the splits have been changed to the new Hadoop splits - assert(operatedRDD.splits.toList === operatedRDD.checkpointData.get.getSplits.toList) + // Test whether the partitions have been changed to the new Hadoop partitions + assert(operatedRDD.partitions.toList === operatedRDD.checkpointData.get.getPartitions.toList) - // Test whether the number of splits is same as before - assert(operatedRDD.splits.length === numSplits) + // Test whether the number of partitions is same as before + assert(operatedRDD.partitions.length === numPartitions) // Test whether the data in the checkpointed RDD is same as original assert(operatedRDD.collect() === result) @@ -215,18 +215,18 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { ) } - // Test whether serialized size of the splits has reduced. If the splits - // do not have any non-transient reference to another RDD or another RDD's splits, it + // Test whether serialized size of the partitions has reduced. If the partitions + // do not have any non-transient reference to another RDD or another RDD's partitions, it // does not refer to a lineage and therefore may not reduce in size after checkpointing. - // However, if the original splits before checkpointing do refer to a parent RDD, the splits + // However, if the original partitions before checkpointing do refer to a parent RDD, the partitions // must be forgotten after checkpointing (to remove all reference to parent RDDs) and - // replaced with the HadoopSplits of the checkpointed RDD. - if (testRDDSplitSize) { - logInfo("Size of " + rddType + " splits " + // replaced with the HadooPartitions of the checkpointed RDD. + if (testRDDPartitionSize) { + logInfo("Size of " + rddType + " partitions " + "[" + splitSizeBeforeCheckpoint + " --> " + splitSizeAfterCheckpoint + "]") assert( splitSizeAfterCheckpoint < splitSizeBeforeCheckpoint, - "Size of " + rddType + " splits did not reduce after checkpointing " + + "Size of " + rddType + " partitions did not reduce after checkpointing " + "[" + splitSizeBeforeCheckpoint + " --> " + splitSizeAfterCheckpoint + "]" ) } @@ -235,13 +235,13 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { /** * Test whether checkpointing of the parent of the generated RDD also * truncates the lineage or not. Some RDDs like CoGroupedRDD hold on to its parent - * RDDs splits. So even if the parent RDD is checkpointed and its splits changed, - * this RDD will remember the splits and therefore potentially the whole lineage. + * RDDs partitions. 
So even if the parent RDD is checkpointed and its partitions changed, + * this RDD will remember the partitions and therefore potentially the whole lineage. */ def testParentCheckpointing[U: ClassManifest]( op: (RDD[Int]) => RDD[U], testRDDSize: Boolean, - testRDDSplitSize: Boolean + testRDDPartitionSize: Boolean ) { // Generate the final RDD using given RDD operation val baseRDD = generateLongLineageRDD() @@ -250,9 +250,9 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { val rddType = operatedRDD.getClass.getSimpleName val parentRDDType = parentRDD.getClass.getSimpleName - // Get the splits and dependencies of the parent in case they're lazily computed + // Get the partitions and dependencies of the parent in case they're lazily computed parentRDD.dependencies - parentRDD.splits + parentRDD.partitions // Find serialized sizes before and after the checkpoint val (rddSizeBeforeCheckpoint, splitSizeBeforeCheckpoint) = getSerializedSizes(operatedRDD) @@ -275,16 +275,16 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { ) } - // Test whether serialized size of the splits has reduced because of its parent being - // checkpointed. If the splits do not have any non-transient reference to another RDD - // or another RDD's splits, it does not refer to a lineage and therefore may not reduce - // in size after checkpointing. However, if the splits do refer to the *splits* of a parent - // RDD, then these splits must update reference to the parent RDD splits as the parent RDD's - // splits must have changed after checkpointing. - if (testRDDSplitSize) { + // Test whether serialized size of the partitions has reduced because of its parent being + // checkpointed. If the partitions do not have any non-transient reference to another RDD + // or another RDD's partitions, it does not refer to a lineage and therefore may not reduce + // in size after checkpointing. However, if the partitions do refer to the *partitions* of a parent + // RDD, then these partitions must update reference to the parent RDD partitions as the parent RDD's + // partitions must have changed after checkpointing. + if (testRDDPartitionSize) { assert( splitSizeAfterCheckpoint < splitSizeBeforeCheckpoint, - "Size of " + rddType + " splits did not reduce after checkpointing parent " + parentRDDType + + "Size of " + rddType + " partitions did not reduce after checkpointing parent " + parentRDDType + "[" + splitSizeBeforeCheckpoint + " --> " + splitSizeAfterCheckpoint + "]" ) } @@ -321,12 +321,12 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { } /** - * Get serialized sizes of the RDD and its splits, in order to test whether the size shrinks + * Get serialized sizes of the RDD and its partitions, in order to test whether the size shrinks * upon checkpointing. Ignores the checkpointData field, which may grow when we checkpoint. 
*/ def getSerializedSizes(rdd: RDD[_]): (Int, Int) = { (Utils.serialize(rdd).length - Utils.serialize(rdd.checkpointData).length, - Utils.serialize(rdd.splits).length) + Utils.serialize(rdd.partitions).length) } /** diff --git a/core/src/test/scala/spark/RDDSuite.scala b/core/src/test/scala/spark/RDDSuite.scala index ffa866de75..9739ba869b 100644 --- a/core/src/test/scala/spark/RDDSuite.scala +++ b/core/src/test/scala/spark/RDDSuite.scala @@ -33,6 +33,11 @@ class RDDSuite extends FunSuite with LocalSparkContext { } assert(partitionSumsWithSplit.collect().toList === List((0, 3), (1, 7))) + val partitionSumsWithIndex = nums.mapPartitionsWithIndex { + case(split, iter) => Iterator((split, iter.reduceLeft(_ + _))) + } + assert(partitionSumsWithIndex.collect().toList === List((0, 3), (1, 7))) + intercept[UnsupportedOperationException] { nums.filter(_ > 5).reduce(_ + _) } @@ -97,12 +102,12 @@ class RDDSuite extends FunSuite with LocalSparkContext { test("caching with failures") { sc = new SparkContext("local", "test") - val onlySplit = new Split { override def index: Int = 0 } + val onlySplit = new Partition { override def index: Int = 0 } var shouldFail = true val rdd = new RDD[Int](sc, Nil) { - override def getSplits: Array[Split] = Array(onlySplit) + override def getPartitions: Array[Partition] = Array(onlySplit) override val getDependencies = List[Dependency[_]]() - override def compute(split: Split, context: TaskContext): Iterator[Int] = { + override def compute(split: Partition, context: TaskContext): Iterator[Int] = { if (shouldFail) { throw new Exception("injected failure") } else { @@ -168,7 +173,7 @@ class RDDSuite extends FunSuite with LocalSparkContext { val data = sc.parallelize(1 to 10, 10) // Note that split number starts from 0, so > 8 means only 10th partition left. 
val prunedRdd = new PartitionPruningRDD(data, splitNum => splitNum > 8) - assert(prunedRdd.splits.size === 1) + assert(prunedRdd.partitions.size === 1) val prunedData = prunedRdd.collect() assert(prunedData.size === 1) assert(prunedData(0) === 10) diff --git a/core/src/test/scala/spark/ShuffleSuite.scala b/core/src/test/scala/spark/ShuffleSuite.scala index 50f2b294bf..92c3f67416 100644 --- a/core/src/test/scala/spark/ShuffleSuite.scala +++ b/core/src/test/scala/spark/ShuffleSuite.scala @@ -222,7 +222,7 @@ class ShuffleSuite extends FunSuite with ShouldMatchers with LocalSparkContext { sc = new SparkContext("local", "test") val emptyDir = Files.createTempDir() val file = sc.textFile(emptyDir.getAbsolutePath) - assert(file.splits.size == 0) + assert(file.partitions.size == 0) assert(file.collect().toList === Nil) // Test that a shuffle on the file works, because this used to be a bug assert(file.map(line => (line, 1)).reduceByKey(_ + _).collect().toList === Nil) diff --git a/core/src/test/scala/spark/SortingSuite.scala b/core/src/test/scala/spark/SortingSuite.scala index edb8c839fc..495f957e53 100644 --- a/core/src/test/scala/spark/SortingSuite.scala +++ b/core/src/test/scala/spark/SortingSuite.scala @@ -19,7 +19,7 @@ class SortingSuite extends FunSuite with LocalSparkContext with ShouldMatchers w val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr, 2) val sorted = pairs.sortByKey() - assert(sorted.splits.size === 2) + assert(sorted.partitions.size === 2) assert(sorted.collect() === pairArr.sortBy(_._1)) } @@ -29,17 +29,17 @@ class SortingSuite extends FunSuite with LocalSparkContext with ShouldMatchers w val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr, 2) val sorted = pairs.sortByKey(true, 1) - assert(sorted.splits.size === 1) + assert(sorted.partitions.size === 1) assert(sorted.collect() === pairArr.sortBy(_._1)) } - test("large array with many splits") { + test("large array with many partitions") { sc = new SparkContext("local", "test") val rand = new scala.util.Random() val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr, 2) val sorted = pairs.sortByKey(true, 20) - assert(sorted.splits.size === 20) + assert(sorted.partitions.size === 20) assert(sorted.collect() === pairArr.sortBy(_._1)) } @@ -59,7 +59,7 @@ class SortingSuite extends FunSuite with LocalSparkContext with ShouldMatchers w assert(pairs.sortByKey(false, 1).collect() === pairArr.sortWith((x, y) => x._1 > y._1)) } - test("sort descending with many splits") { + test("sort descending with many partitions") { sc = new SparkContext("local", "test") val rand = new scala.util.Random() val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } diff --git a/core/src/test/scala/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/spark/scheduler/DAGSchedulerSuite.scala index 83663ac702..8de490eb86 100644 --- a/core/src/test/scala/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/spark/scheduler/DAGSchedulerSuite.scala @@ -24,7 +24,7 @@ import spark.MapOutputTracker import spark.RDD import spark.SparkContext import spark.SparkException -import spark.Split +import spark.Partition import spark.TaskContext import spark.TaskEndReason @@ -144,18 +144,18 @@ class DAGSchedulerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar * so we can test that DAGScheduler does not try to execute RDDs locally. 
*/ def makeRdd( - numSplits: Int, + numPartitions: Int, dependencies: List[Dependency[_]], locations: Seq[Seq[String]] = Nil ): MyRDD = { - val maxSplit = numSplits - 1 + val maxPartition = numPartitions - 1 return new MyRDD(sc, dependencies) { - override def compute(split: Split, context: TaskContext): Iterator[(Int, Int)] = + override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = throw new RuntimeException("should not be reached") - override def getSplits() = (0 to maxSplit).map(i => new Split { + override def getPartitions = (0 to maxPartition).map(i => new Partition { override def index = i }).toArray - override def getPreferredLocations(split: Split): Seq[String] = + override def getPreferredLocations(split: Partition): Seq[String] = if (locations.isDefinedAt(split.index)) locations(split.index) else @@ -295,11 +295,11 @@ class DAGSchedulerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar * collect the result of the job via callbacks from DAGScheduler. */ def submitRdd(rdd: MyRDD, allowLocal: Boolean = false): (JobWaiter[Int], Array[Int]) = { - val resultArray = new Array[Int](rdd.splits.size) + val resultArray = new Array[Int](rdd.partitions.size) val (toSubmit, waiter) = scheduler.prepareJob[(Int, Int), Int]( rdd, jobComputeFunc, - (0 to (rdd.splits.size - 1)), + (0 to (rdd.partitions.size - 1)), "test-site", allowLocal, (i: Int, value: Int) => resultArray(i) = value @@ -355,10 +355,10 @@ class DAGSchedulerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar test("local job") { val rdd = new MyRDD(sc, Nil) { - override def compute(split: Split, context: TaskContext): Iterator[(Int, Int)] = + override def compute(split: Partition, context: TaskContext): Iterator[(Int, Int)] = Array(42 -> 0).iterator - override def getSplits() = Array( new Split { override def index = 0 } ) - override def getPreferredLocations(split: Split) = Nil + override def getPartitions = Array( new Partition { override def index = 0 } ) + override def getPreferredLocations(split: Partition) = Nil override def toString = "DAGSchedulerSuite Local RDD" } submitRdd(rdd, true) diff --git a/core/src/test/scala/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/spark/scheduler/TaskContextSuite.scala index a5db7103f5..647bcaf860 100644 --- a/core/src/test/scala/spark/scheduler/TaskContextSuite.scala +++ b/core/src/test/scala/spark/scheduler/TaskContextSuite.scala @@ -5,7 +5,7 @@ import org.scalatest.BeforeAndAfter import spark.TaskContext import spark.RDD import spark.SparkContext -import spark.Split +import spark.Partition import spark.LocalSparkContext class TaskContextSuite extends FunSuite with BeforeAndAfter with LocalSparkContext { @@ -14,8 +14,8 @@ class TaskContextSuite extends FunSuite with BeforeAndAfter with LocalSparkConte var completed = false sc = new SparkContext("local", "test") val rdd = new RDD[String](sc, List()) { - override def getSplits = Array[Split](StubSplit(0)) - override def compute(split: Split, context: TaskContext) = { + override def getPartitions = Array[Partition](StubPartition(0)) + override def compute(split: Partition, context: TaskContext) = { context.addOnCompleteCallback(() => completed = true) sys.error("failed") } @@ -28,5 +28,5 @@ class TaskContextSuite extends FunSuite with BeforeAndAfter with LocalSparkConte assert(completed === true) } - case class StubSplit(val index: Int) extends Split -} \ No newline at end of file + case class StubPartition(val index: Int) extends Partition +} -- cgit v1.2.3 From 
7151e1e4c8f4f764c54047ef82b988f887a0b9c7 Mon Sep 17 00:00:00 2001 From: Matei Zaharia Date: Sun, 17 Feb 2013 23:23:08 -0800 Subject: Rename "jobs" to "applications" in the standalone cluster --- core/src/main/scala/spark/SparkContext.scala | 12 +- .../scala/spark/api/java/JavaSparkContext.scala | 22 +-- .../scala/spark/api/python/PythonPartitioner.scala | 2 +- .../spark/deploy/ApplicationDescription.scala | 14 ++ .../main/scala/spark/deploy/DeployMessage.scala | 19 +-- .../main/scala/spark/deploy/JobDescription.scala | 14 -- .../src/main/scala/spark/deploy/JsonProtocol.scala | 18 +-- .../main/scala/spark/deploy/client/Client.scala | 22 +-- .../scala/spark/deploy/client/ClientListener.scala | 2 +- .../scala/spark/deploy/client/TestClient.scala | 6 +- .../spark/deploy/master/ApplicationInfo.scala | 63 ++++++++ .../spark/deploy/master/ApplicationState.scala | 11 ++ .../scala/spark/deploy/master/ExecutorInfo.scala | 4 +- .../main/scala/spark/deploy/master/JobInfo.scala | 63 -------- .../main/scala/spark/deploy/master/JobState.scala | 9 -- .../main/scala/spark/deploy/master/Master.scala | 174 ++++++++++----------- .../scala/spark/deploy/master/MasterWebUI.scala | 22 +-- .../scala/spark/deploy/master/WorkerInfo.scala | 4 +- .../scala/spark/deploy/worker/ExecutorRunner.scala | 26 +-- .../main/scala/spark/deploy/worker/Worker.scala | 20 +-- .../spark/deploy/worker/WorkerArguments.scala | 2 +- .../scala/spark/deploy/worker/WorkerWebUI.scala | 4 +- .../cluster/SparkDeploySchedulerBackend.scala | 15 +- .../mesos/CoarseMesosSchedulerBackend.scala | 4 +- .../scheduler/mesos/MesosSchedulerBackend.scala | 4 +- .../spark/deploy/master/app_details.scala.html | 40 +++++ .../twirl/spark/deploy/master/app_row.scala.html | 20 +++ .../twirl/spark/deploy/master/app_table.scala.html | 21 +++ .../spark/deploy/master/executor_row.scala.html | 6 +- .../twirl/spark/deploy/master/index.scala.html | 16 +- .../spark/deploy/master/job_details.scala.html | 40 ----- .../twirl/spark/deploy/master/job_row.scala.html | 20 --- .../twirl/spark/deploy/master/job_table.scala.html | 21 --- .../spark/deploy/worker/executor_row.scala.html | 10 +- .../main/scala/spark/streaming/Checkpoint.scala | 2 +- .../scala/spark/streaming/StreamingContext.scala | 10 +- .../streaming/api/java/JavaStreamingContext.scala | 6 +- 37 files changed, 386 insertions(+), 382 deletions(-) create mode 100644 core/src/main/scala/spark/deploy/ApplicationDescription.scala delete mode 100644 core/src/main/scala/spark/deploy/JobDescription.scala create mode 100644 core/src/main/scala/spark/deploy/master/ApplicationInfo.scala create mode 100644 core/src/main/scala/spark/deploy/master/ApplicationState.scala delete mode 100644 core/src/main/scala/spark/deploy/master/JobInfo.scala delete mode 100644 core/src/main/scala/spark/deploy/master/JobState.scala create mode 100644 core/src/main/twirl/spark/deploy/master/app_details.scala.html create mode 100644 core/src/main/twirl/spark/deploy/master/app_row.scala.html create mode 100644 core/src/main/twirl/spark/deploy/master/app_table.scala.html delete mode 100644 core/src/main/twirl/spark/deploy/master/job_details.scala.html delete mode 100644 core/src/main/twirl/spark/deploy/master/job_row.scala.html delete mode 100644 core/src/main/twirl/spark/deploy/master/job_table.scala.html diff --git a/core/src/main/scala/spark/SparkContext.scala b/core/src/main/scala/spark/SparkContext.scala index f299b7ea46..d39767c3b3 100644 --- a/core/src/main/scala/spark/SparkContext.scala +++ b/core/src/main/scala/spark/SparkContext.scala @@ 
-53,7 +53,7 @@ import storage.{StorageStatus, StorageUtils, RDDInfo} * cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster. * * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI. + * @param appName A name for your application, to display on the cluster web UI. * @param sparkHome Location where Spark is installed on cluster nodes. * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. @@ -61,7 +61,7 @@ import storage.{StorageStatus, StorageUtils, RDDInfo} */ class SparkContext( val master: String, - val jobName: String, + val appName: String, val sparkHome: String = null, val jars: Seq[String] = Nil, environment: Map[String, String] = Map()) @@ -143,7 +143,7 @@ class SparkContext( case SPARK_REGEX(sparkUrl) => val scheduler = new ClusterScheduler(this) - val backend = new SparkDeploySchedulerBackend(scheduler, this, sparkUrl, jobName) + val backend = new SparkDeploySchedulerBackend(scheduler, this, sparkUrl, appName) scheduler.initialize(backend) scheduler @@ -162,7 +162,7 @@ class SparkContext( val localCluster = new LocalSparkCluster( numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt) val sparkUrl = localCluster.start() - val backend = new SparkDeploySchedulerBackend(scheduler, this, sparkUrl, jobName) + val backend = new SparkDeploySchedulerBackend(scheduler, this, sparkUrl, appName) scheduler.initialize(backend) backend.shutdownCallback = (backend: SparkDeploySchedulerBackend) => { localCluster.stop() @@ -178,9 +178,9 @@ class SparkContext( val coarseGrained = System.getProperty("spark.mesos.coarse", "false").toBoolean val masterWithoutProtocol = master.replaceFirst("^mesos://", "") // Strip initial mesos:// val backend = if (coarseGrained) { - new CoarseMesosSchedulerBackend(scheduler, this, masterWithoutProtocol, jobName) + new CoarseMesosSchedulerBackend(scheduler, this, masterWithoutProtocol, appName) } else { - new MesosSchedulerBackend(scheduler, this, masterWithoutProtocol, jobName) + new MesosSchedulerBackend(scheduler, this, masterWithoutProtocol, appName) } scheduler.initialize(backend) scheduler diff --git a/core/src/main/scala/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/spark/api/java/JavaSparkContext.scala index 50b8970cd8..f75fc27c7b 100644 --- a/core/src/main/scala/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/spark/api/java/JavaSparkContext.scala @@ -23,41 +23,41 @@ class JavaSparkContext(val sc: SparkContext) extends JavaSparkContextVarargsWork /** * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI + * @param appName A name for your application, to display on the cluster web UI */ - def this(master: String, jobName: String) = this(new SparkContext(master, jobName)) + def this(master: String, appName: String) = this(new SparkContext(master, appName)) /** * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI + * @param appName A name for your application, to display on the cluster web UI * @param sparkHome The SPARK_HOME directory on the slave nodes * @param jars Collection of JARs to send to the cluster. 
These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. */ - def this(master: String, jobName: String, sparkHome: String, jarFile: String) = - this(new SparkContext(master, jobName, sparkHome, Seq(jarFile))) + def this(master: String, appName: String, sparkHome: String, jarFile: String) = + this(new SparkContext(master, appName, sparkHome, Seq(jarFile))) /** * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI + * @param appName A name for your application, to display on the cluster web UI * @param sparkHome The SPARK_HOME directory on the slave nodes * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. */ - def this(master: String, jobName: String, sparkHome: String, jars: Array[String]) = - this(new SparkContext(master, jobName, sparkHome, jars.toSeq)) + def this(master: String, appName: String, sparkHome: String, jars: Array[String]) = + this(new SparkContext(master, appName, sparkHome, jars.toSeq)) /** * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param jobName A name for your job, to display on the cluster web UI + * @param appName A name for your application, to display on the cluster web UI * @param sparkHome The SPARK_HOME directory on the slave nodes * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. * @param environment Environment variables to set on worker nodes */ - def this(master: String, jobName: String, sparkHome: String, jars: Array[String], + def this(master: String, appName: String, sparkHome: String, jars: Array[String], environment: JMap[String, String]) = - this(new SparkContext(master, jobName, sparkHome, jars.toSeq, environment)) + this(new SparkContext(master, appName, sparkHome, jars.toSeq, environment)) private[spark] val env = sc.env diff --git a/core/src/main/scala/spark/api/python/PythonPartitioner.scala b/core/src/main/scala/spark/api/python/PythonPartitioner.scala index 519e310323..d618c098c2 100644 --- a/core/src/main/scala/spark/api/python/PythonPartitioner.scala +++ b/core/src/main/scala/spark/api/python/PythonPartitioner.scala @@ -9,7 +9,7 @@ import java.util.Arrays * * Stores the unique id() of the Python-side partitioning function so that it is incorporated into * equality comparisons. Correctness requires that the id is a unique identifier for the - * lifetime of the job (i.e. that it is not re-used as the id of a different partitioning + * lifetime of the program (i.e. that it is not re-used as the id of a different partitioning * function). This can be ensured by using the Python id() function and maintaining a reference * to the Python partitioning function so that its id() is not reused. 
*/ diff --git a/core/src/main/scala/spark/deploy/ApplicationDescription.scala b/core/src/main/scala/spark/deploy/ApplicationDescription.scala new file mode 100644 index 0000000000..6659e53b25 --- /dev/null +++ b/core/src/main/scala/spark/deploy/ApplicationDescription.scala @@ -0,0 +1,14 @@ +package spark.deploy + +private[spark] class ApplicationDescription( + val name: String, + val cores: Int, + val memoryPerSlave: Int, + val command: Command, + val sparkHome: String) + extends Serializable { + + val user = System.getProperty("user.name", "") + + override def toString: String = "ApplicationDescription(" + name + ")" +} diff --git a/core/src/main/scala/spark/deploy/DeployMessage.scala b/core/src/main/scala/spark/deploy/DeployMessage.scala index 1d88d4bc84..3cbf4fdd98 100644 --- a/core/src/main/scala/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/spark/deploy/DeployMessage.scala @@ -1,7 +1,7 @@ package spark.deploy import spark.deploy.ExecutorState.ExecutorState -import spark.deploy.master.{WorkerInfo, JobInfo} +import spark.deploy.master.{WorkerInfo, ApplicationInfo} import spark.deploy.worker.ExecutorRunner import scala.collection.immutable.List @@ -23,7 +23,7 @@ case class RegisterWorker( private[spark] case class ExecutorStateChanged( - jobId: String, + appId: String, execId: Int, state: ExecutorState, message: Option[String], @@ -36,12 +36,12 @@ private[spark] case class Heartbeat(workerId: String) extends DeployMessage private[spark] case class RegisteredWorker(masterWebUiUrl: String) extends DeployMessage private[spark] case class RegisterWorkerFailed(message: String) extends DeployMessage -private[spark] case class KillExecutor(jobId: String, execId: Int) extends DeployMessage +private[spark] case class KillExecutor(appId: String, execId: Int) extends DeployMessage private[spark] case class LaunchExecutor( - jobId: String, + appId: String, execId: Int, - jobDesc: JobDescription, + appDesc: ApplicationDescription, cores: Int, memory: Int, sparkHome: String) @@ -49,12 +49,13 @@ private[spark] case class LaunchExecutor( // Client to Master -private[spark] case class RegisterJob(jobDescription: JobDescription) extends DeployMessage +private[spark] case class RegisterApplication(appDescription: ApplicationDescription) + extends DeployMessage // Master to Client private[spark] -case class RegisteredJob(jobId: String) extends DeployMessage +case class RegisteredApplication(appId: String) extends DeployMessage private[spark] case class ExecutorAdded(id: Int, workerId: String, host: String, cores: Int, memory: Int) @@ -64,7 +65,7 @@ case class ExecutorUpdated(id: Int, state: ExecutorState, message: Option[String exitStatus: Option[Int]) private[spark] -case class JobKilled(message: String) +case class appKilled(message: String) // Internal message in Client @@ -78,7 +79,7 @@ private[spark] case object RequestMasterState private[spark] case class MasterState(host: String, port: Int, workers: Array[WorkerInfo], - activeJobs: Array[JobInfo], completedJobs: Array[JobInfo]) { + activeApps: Array[ApplicationInfo], completedApps: Array[ApplicationInfo]) { def uri = "spark://" + host + ":" + port } diff --git a/core/src/main/scala/spark/deploy/JobDescription.scala b/core/src/main/scala/spark/deploy/JobDescription.scala deleted file mode 100644 index 7160fc05fc..0000000000 --- a/core/src/main/scala/spark/deploy/JobDescription.scala +++ /dev/null @@ -1,14 +0,0 @@ -package spark.deploy - -private[spark] class JobDescription( - val name: String, - val cores: Int, - val memoryPerSlave: Int, 
- val command: Command, - val sparkHome: String) - extends Serializable { - - val user = System.getProperty("user.name", "") - - override def toString: String = "JobDescription(" + name + ")" -} diff --git a/core/src/main/scala/spark/deploy/JsonProtocol.scala b/core/src/main/scala/spark/deploy/JsonProtocol.scala index 732fa08064..38a6ebfc24 100644 --- a/core/src/main/scala/spark/deploy/JsonProtocol.scala +++ b/core/src/main/scala/spark/deploy/JsonProtocol.scala @@ -1,6 +1,6 @@ package spark.deploy -import master.{JobInfo, WorkerInfo} +import master.{ApplicationInfo, WorkerInfo} import worker.ExecutorRunner import cc.spray.json._ @@ -20,8 +20,8 @@ private[spark] object JsonProtocol extends DefaultJsonProtocol { ) } - implicit object JobInfoJsonFormat extends RootJsonWriter[JobInfo] { - def write(obj: JobInfo) = JsObject( + implicit object AppInfoJsonFormat extends RootJsonWriter[ApplicationInfo] { + def write(obj: ApplicationInfo) = JsObject( "starttime" -> JsNumber(obj.startTime), "id" -> JsString(obj.id), "name" -> JsString(obj.desc.name), @@ -31,8 +31,8 @@ private[spark] object JsonProtocol extends DefaultJsonProtocol { "submitdate" -> JsString(obj.submitDate.toString)) } - implicit object JobDescriptionJsonFormat extends RootJsonWriter[JobDescription] { - def write(obj: JobDescription) = JsObject( + implicit object AppDescriptionJsonFormat extends RootJsonWriter[ApplicationDescription] { + def write(obj: ApplicationDescription) = JsObject( "name" -> JsString(obj.name), "cores" -> JsNumber(obj.cores), "memoryperslave" -> JsNumber(obj.memoryPerSlave), @@ -44,8 +44,8 @@ private[spark] object JsonProtocol extends DefaultJsonProtocol { def write(obj: ExecutorRunner) = JsObject( "id" -> JsNumber(obj.execId), "memory" -> JsNumber(obj.memory), - "jobid" -> JsString(obj.jobId), - "jobdesc" -> obj.jobDesc.toJson.asJsObject + "appid" -> JsString(obj.appId), + "appdesc" -> obj.appDesc.toJson.asJsObject ) } @@ -57,8 +57,8 @@ private[spark] object JsonProtocol extends DefaultJsonProtocol { "coresused" -> JsNumber(obj.workers.map(_.coresUsed).sum), "memory" -> JsNumber(obj.workers.map(_.memory).sum), "memoryused" -> JsNumber(obj.workers.map(_.memoryUsed).sum), - "activejobs" -> JsArray(obj.activeJobs.toList.map(_.toJson)), - "completedjobs" -> JsArray(obj.completedJobs.toList.map(_.toJson)) + "activeapps" -> JsArray(obj.activeApps.toList.map(_.toJson)), + "completedapps" -> JsArray(obj.completedApps.toList.map(_.toJson)) ) } diff --git a/core/src/main/scala/spark/deploy/client/Client.scala b/core/src/main/scala/spark/deploy/client/Client.scala index e01181d1b2..1a95524cf9 100644 --- a/core/src/main/scala/spark/deploy/client/Client.scala +++ b/core/src/main/scala/spark/deploy/client/Client.scala @@ -8,25 +8,25 @@ import akka.pattern.AskTimeoutException import spark.{SparkException, Logging} import akka.remote.RemoteClientLifeCycleEvent import akka.remote.RemoteClientShutdown -import spark.deploy.RegisterJob +import spark.deploy.RegisterApplication import spark.deploy.master.Master import akka.remote.RemoteClientDisconnected import akka.actor.Terminated import akka.dispatch.Await /** - * The main class used to talk to a Spark deploy cluster. Takes a master URL, a job description, - * and a listener for job events, and calls back the listener when various events occur. + * The main class used to talk to a Spark deploy cluster. Takes a master URL, an app description, + * and a listener for cluster events, and calls back the listener when various events occur. 
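For a rough picture of what such an application description contains, the sketch below builds one by hand; the application name, core count, memory, and paths are illustrative stand-ins, and in the real code path SparkDeploySchedulerBackend (further down in this patch) derives them from the SparkContext configuration.

    import spark.deploy.{ApplicationDescription, Command}

    // Illustrative stand-in values; both classes are spark-internal types, used here
    // purely to show the shape of the descriptor a client registers with the master.
    val command = Command(
      "spark.executor.StandaloneExecutorBackend", // main class each executor process runs
      Seq(),                                      // command-line arguments (omitted for brevity)
      Map())                                      // extra environment variables for executors

    val appDesc = new ApplicationDescription(
      "MyExampleApp", // name, shown on the master web UI
      4,              // total cores requested across the cluster
      512,            // memory per slave, in MB
      command,
      "/opt/spark")   // sparkHome on the workers

A Client constructed with this description sends RegisterApplication(appDesc) to the master and gets RegisteredApplication(appId) back, as the actor code below shows.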
*/ private[spark] class Client( actorSystem: ActorSystem, masterUrl: String, - jobDescription: JobDescription, + appDescription: ApplicationDescription, listener: ClientListener) extends Logging { var actor: ActorRef = null - var jobId: String = null + var appId: String = null class ClientActor extends Actor with Logging { var master: ActorRef = null @@ -38,7 +38,7 @@ private[spark] class Client( try { master = context.actorFor(Master.toAkkaUrl(masterUrl)) masterAddress = master.path.address - master ! RegisterJob(jobDescription) + master ! RegisterApplication(appDescription) context.system.eventStream.subscribe(self, classOf[RemoteClientLifeCycleEvent]) context.watch(master) // Doesn't work with remote actors, but useful for testing } catch { @@ -50,17 +50,17 @@ private[spark] class Client( } override def receive = { - case RegisteredJob(jobId_) => - jobId = jobId_ - listener.connected(jobId) + case RegisteredApplication(appId_) => + appId = appId_ + listener.connected(appId) case ExecutorAdded(id: Int, workerId: String, host: String, cores: Int, memory: Int) => - val fullId = jobId + "/" + id + val fullId = appId + "/" + id logInfo("Executor added: %s on %s (%s) with %d cores".format(fullId, workerId, host, cores)) listener.executorAdded(fullId, workerId, host, cores, memory) case ExecutorUpdated(id, state, message, exitStatus) => - val fullId = jobId + "/" + id + val fullId = appId + "/" + id val messageText = message.map(s => " (" + s + ")").getOrElse("") logInfo("Executor updated: %s is now %s%s".format(fullId, state, messageText)) if (ExecutorState.isFinished(state)) { diff --git a/core/src/main/scala/spark/deploy/client/ClientListener.scala b/core/src/main/scala/spark/deploy/client/ClientListener.scala index 7035f4b394..b7008321df 100644 --- a/core/src/main/scala/spark/deploy/client/ClientListener.scala +++ b/core/src/main/scala/spark/deploy/client/ClientListener.scala @@ -8,7 +8,7 @@ package spark.deploy.client * Users of this API should *not* block inside the callback methods. 
*/ private[spark] trait ClientListener { - def connected(jobId: String): Unit + def connected(appId: String): Unit def disconnected(): Unit diff --git a/core/src/main/scala/spark/deploy/client/TestClient.scala b/core/src/main/scala/spark/deploy/client/TestClient.scala index 8764c400e2..dc004b59ca 100644 --- a/core/src/main/scala/spark/deploy/client/TestClient.scala +++ b/core/src/main/scala/spark/deploy/client/TestClient.scala @@ -2,13 +2,13 @@ package spark.deploy.client import spark.util.AkkaUtils import spark.{Logging, Utils} -import spark.deploy.{Command, JobDescription} +import spark.deploy.{Command, ApplicationDescription} private[spark] object TestClient { class TestListener extends ClientListener with Logging { def connected(id: String) { - logInfo("Connected to master, got job ID " + id) + logInfo("Connected to master, got app ID " + id) } def disconnected() { @@ -24,7 +24,7 @@ private[spark] object TestClient { def main(args: Array[String]) { val url = args(0) val (actorSystem, port) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0) - val desc = new JobDescription( + val desc = new ApplicationDescription( "TestClient", 1, 512, Command("spark.deploy.client.TestExecutor", Seq(), Map()), "dummy-spark-home") val listener = new TestListener val client = new Client(actorSystem, url, desc, listener) diff --git a/core/src/main/scala/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/spark/deploy/master/ApplicationInfo.scala new file mode 100644 index 0000000000..3591a94072 --- /dev/null +++ b/core/src/main/scala/spark/deploy/master/ApplicationInfo.scala @@ -0,0 +1,63 @@ +package spark.deploy.master + +import spark.deploy.ApplicationDescription +import java.util.Date +import akka.actor.ActorRef +import scala.collection.mutable + +private[spark] class ApplicationInfo( + val startTime: Long, + val id: String, + val desc: ApplicationDescription, + val submitDate: Date, + val driver: ActorRef) +{ + var state = ApplicationState.WAITING + var executors = new mutable.HashMap[Int, ExecutorInfo] + var coresGranted = 0 + var endTime = -1L + + private var nextExecutorId = 0 + + def newExecutorId(): Int = { + val id = nextExecutorId + nextExecutorId += 1 + id + } + + def addExecutor(worker: WorkerInfo, cores: Int): ExecutorInfo = { + val exec = new ExecutorInfo(newExecutorId(), this, worker, cores, desc.memoryPerSlave) + executors(exec.id) = exec + coresGranted += cores + exec + } + + def removeExecutor(exec: ExecutorInfo) { + executors -= exec.id + coresGranted -= exec.cores + } + + def coresLeft: Int = desc.cores - coresGranted + + private var _retryCount = 0 + + def retryCount = _retryCount + + def incrementRetryCount = { + _retryCount += 1 + _retryCount + } + + def markFinished(endState: ApplicationState.Value) { + state = endState + endTime = System.currentTimeMillis() + } + + def duration: Long = { + if (endTime != -1) { + endTime - startTime + } else { + System.currentTimeMillis() - startTime + } + } +} diff --git a/core/src/main/scala/spark/deploy/master/ApplicationState.scala b/core/src/main/scala/spark/deploy/master/ApplicationState.scala new file mode 100644 index 0000000000..15016b388d --- /dev/null +++ b/core/src/main/scala/spark/deploy/master/ApplicationState.scala @@ -0,0 +1,11 @@ +package spark.deploy.master + +private[spark] object ApplicationState + extends Enumeration("WAITING", "RUNNING", "FINISHED", "FAILED") { + + type ApplicationState = Value + + val WAITING, RUNNING, FINISHED, FAILED = Value + + val MAX_NUM_RETRY = 10 +} diff --git 
a/core/src/main/scala/spark/deploy/master/ExecutorInfo.scala b/core/src/main/scala/spark/deploy/master/ExecutorInfo.scala index 1db2c32633..48e6055fb5 100644 --- a/core/src/main/scala/spark/deploy/master/ExecutorInfo.scala +++ b/core/src/main/scala/spark/deploy/master/ExecutorInfo.scala @@ -4,12 +4,12 @@ import spark.deploy.ExecutorState private[spark] class ExecutorInfo( val id: Int, - val job: JobInfo, + val application: ApplicationInfo, val worker: WorkerInfo, val cores: Int, val memory: Int) { var state = ExecutorState.LAUNCHING - def fullId: String = job.id + "/" + id + def fullId: String = application.id + "/" + id } diff --git a/core/src/main/scala/spark/deploy/master/JobInfo.scala b/core/src/main/scala/spark/deploy/master/JobInfo.scala deleted file mode 100644 index a274b21c34..0000000000 --- a/core/src/main/scala/spark/deploy/master/JobInfo.scala +++ /dev/null @@ -1,63 +0,0 @@ -package spark.deploy.master - -import spark.deploy.JobDescription -import java.util.Date -import akka.actor.ActorRef -import scala.collection.mutable - -private[spark] class JobInfo( - val startTime: Long, - val id: String, - val desc: JobDescription, - val submitDate: Date, - val driver: ActorRef) -{ - var state = JobState.WAITING - var executors = new mutable.HashMap[Int, ExecutorInfo] - var coresGranted = 0 - var endTime = -1L - - private var nextExecutorId = 0 - - def newExecutorId(): Int = { - val id = nextExecutorId - nextExecutorId += 1 - id - } - - def addExecutor(worker: WorkerInfo, cores: Int): ExecutorInfo = { - val exec = new ExecutorInfo(newExecutorId(), this, worker, cores, desc.memoryPerSlave) - executors(exec.id) = exec - coresGranted += cores - exec - } - - def removeExecutor(exec: ExecutorInfo) { - executors -= exec.id - coresGranted -= exec.cores - } - - def coresLeft: Int = desc.cores - coresGranted - - private var _retryCount = 0 - - def retryCount = _retryCount - - def incrementRetryCount = { - _retryCount += 1 - _retryCount - } - - def markFinished(endState: JobState.Value) { - state = endState - endTime = System.currentTimeMillis() - } - - def duration: Long = { - if (endTime != -1) { - endTime - startTime - } else { - System.currentTimeMillis() - startTime - } - } -} diff --git a/core/src/main/scala/spark/deploy/master/JobState.scala b/core/src/main/scala/spark/deploy/master/JobState.scala deleted file mode 100644 index 2b70cf0191..0000000000 --- a/core/src/main/scala/spark/deploy/master/JobState.scala +++ /dev/null @@ -1,9 +0,0 @@ -package spark.deploy.master - -private[spark] object JobState extends Enumeration("WAITING", "RUNNING", "FINISHED", "FAILED") { - type JobState = Value - - val WAITING, RUNNING, FINISHED, FAILED = Value - - val MAX_NUM_RETRY = 10 -} diff --git a/core/src/main/scala/spark/deploy/master/Master.scala b/core/src/main/scala/spark/deploy/master/Master.scala index a5de23261c..1cd68a2aa6 100644 --- a/core/src/main/scala/spark/deploy/master/Master.scala +++ b/core/src/main/scala/spark/deploy/master/Master.scala @@ -16,22 +16,22 @@ import spark.util.AkkaUtils private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging { - val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For job IDs + val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For application IDs val WORKER_TIMEOUT = System.getProperty("spark.worker.timeout", "60").toLong * 1000 - var nextJobNumber = 0 + var nextAppNumber = 0 val workers = new HashSet[WorkerInfo] val idToWorker = new HashMap[String, WorkerInfo] val actorToWorker = new HashMap[ActorRef, 
WorkerInfo] val addressToWorker = new HashMap[Address, WorkerInfo] - val jobs = new HashSet[JobInfo] - val idToJob = new HashMap[String, JobInfo] - val actorToJob = new HashMap[ActorRef, JobInfo] - val addressToJob = new HashMap[Address, JobInfo] + val apps = new HashSet[ApplicationInfo] + val idToApp = new HashMap[String, ApplicationInfo] + val actorToApp = new HashMap[ActorRef, ApplicationInfo] + val addressToApp = new HashMap[Address, ApplicationInfo] - val waitingJobs = new ArrayBuffer[JobInfo] - val completedJobs = new ArrayBuffer[JobInfo] + val waitingApps = new ArrayBuffer[ApplicationInfo] + val completedApps = new ArrayBuffer[ApplicationInfo] val masterPublicAddress = { val envVar = System.getenv("SPARK_PUBLIC_DNS") @@ -39,9 +39,9 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor } // As a temporary workaround before better ways of configuring memory, we allow users to set - // a flag that will perform round-robin scheduling across the nodes (spreading out each job - // among all the nodes) instead of trying to consolidate each job onto a small # of nodes. - val spreadOutJobs = System.getProperty("spark.deploy.spreadOut", "false").toBoolean + // a flag that will perform round-robin scheduling across the nodes (spreading out each app + // among all the nodes) instead of trying to consolidate each app onto a small # of nodes. + val spreadOutApps = System.getProperty("spark.deploy.spreadOut", "false").toBoolean override def preStart() { logInfo("Starting Spark master at spark://" + ip + ":" + port) @@ -76,41 +76,41 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor } } - case RegisterJob(description) => { - logInfo("Registering job " + description.name) - val job = addJob(description, sender) - logInfo("Registered job " + description.name + " with ID " + job.id) - waitingJobs += job + case RegisterApplication(description) => { + logInfo("Registering app " + description.name) + val app = addApplication(description, sender) + logInfo("Registered app " + description.name + " with ID " + app.id) + waitingApps += app context.watch(sender) // This doesn't work with remote actors but helps for testing - sender ! RegisteredJob(job.id) + sender ! RegisteredApplication(app.id) schedule() } - case ExecutorStateChanged(jobId, execId, state, message, exitStatus) => { - val execOption = idToJob.get(jobId).flatMap(job => job.executors.get(execId)) + case ExecutorStateChanged(appId, execId, state, message, exitStatus) => { + val execOption = idToApp.get(appId).flatMap(app => app.executors.get(execId)) execOption match { case Some(exec) => { exec.state = state - exec.job.driver ! ExecutorUpdated(execId, state, message, exitStatus) + exec.application.driver ! ExecutorUpdated(execId, state, message, exitStatus) if (ExecutorState.isFinished(state)) { - val jobInfo = idToJob(jobId) - // Remove this executor from the worker and job + val appInfo = idToApp(appId) + // Remove this executor from the worker and app logInfo("Removing executor " + exec.fullId + " because it is " + state) - jobInfo.removeExecutor(exec) + appInfo.removeExecutor(exec) exec.worker.removeExecutor(exec) // Only retry certain number of times so we don't go into an infinite loop. 
- if (jobInfo.incrementRetryCount < JobState.MAX_NUM_RETRY) { + if (appInfo.incrementRetryCount < ApplicationState.MAX_NUM_RETRY) { schedule() } else { - logError("Job %s with ID %s failed %d times, removing it".format( - jobInfo.desc.name, jobInfo.id, jobInfo.retryCount)) - removeJob(jobInfo) + logError("Application %s with ID %s failed %d times, removing it".format( + appInfo.desc.name, appInfo.id, appInfo.retryCount)) + removeApplication(appInfo) } } } case None => - logWarning("Got status update for unknown executor " + jobId + "/" + execId) + logWarning("Got status update for unknown executor " + appId + "/" + execId) } } @@ -124,53 +124,53 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor } case Terminated(actor) => { - // The disconnected actor could've been either a worker or a job; remove whichever of + // The disconnected actor could've been either a worker or an app; remove whichever of // those we have an entry for in the corresponding actor hashmap actorToWorker.get(actor).foreach(removeWorker) - actorToJob.get(actor).foreach(removeJob) + actorToApp.get(actor).foreach(removeApplication) } case RemoteClientDisconnected(transport, address) => { - // The disconnected client could've been either a worker or a job; remove whichever it was + // The disconnected client could've been either a worker or an app; remove whichever it was addressToWorker.get(address).foreach(removeWorker) - addressToJob.get(address).foreach(removeJob) + addressToApp.get(address).foreach(removeApplication) } case RemoteClientShutdown(transport, address) => { - // The disconnected client could've been either a worker or a job; remove whichever it was + // The disconnected client could've been either a worker or an app; remove whichever it was addressToWorker.get(address).foreach(removeWorker) - addressToJob.get(address).foreach(removeJob) + addressToApp.get(address).foreach(removeApplication) } case RequestMasterState => { - sender ! MasterState(ip, port, workers.toArray, jobs.toArray, completedJobs.toArray) + sender ! MasterState(ip, port, workers.toArray, apps.toArray, completedApps.toArray) } } /** - * Can a job use the given worker? True if the worker has enough memory and we haven't already - * launched an executor for the job on it (right now the standalone backend doesn't like having + * Can an app use the given worker? True if the worker has enough memory and we haven't already + * launched an executor for the app on it (right now the standalone backend doesn't like having * two executors on the same worker). */ - def canUse(job: JobInfo, worker: WorkerInfo): Boolean = { - worker.memoryFree >= job.desc.memoryPerSlave && !worker.hasExecutor(job) + def canUse(app: ApplicationInfo, worker: WorkerInfo): Boolean = { + worker.memoryFree >= app.desc.memoryPerSlave && !worker.hasExecutor(app) } /** - * Schedule the currently available resources among waiting jobs. This method will be called - * every time a new job joins or resource availability changes. + * Schedule the currently available resources among waiting apps. This method will be called + * every time a new app joins or resource availability changes. */ def schedule() { - // Right now this is a very simple FIFO scheduler. We keep trying to fit in the first job - // in the queue, then the second job, etc. - if (spreadOutJobs) { - // Try to spread out each job among all the nodes, until it has all its cores - for (job <- waitingJobs if job.coresLeft > 0) { + // Right now this is a very simple FIFO scheduler. 
We keep trying to fit in the first app + // in the queue, then the second app, etc. + if (spreadOutApps) { + // Try to spread out each app among all the nodes, until it has all its cores + for (app <- waitingApps if app.coresLeft > 0) { val usableWorkers = workers.toArray.filter(_.state == WorkerState.ALIVE) - .filter(canUse(job, _)).sortBy(_.coresFree).reverse + .filter(canUse(app, _)).sortBy(_.coresFree).reverse val numUsable = usableWorkers.length val assigned = new Array[Int](numUsable) // Number of cores to give on each node - var toAssign = math.min(job.coresLeft, usableWorkers.map(_.coresFree).sum) + var toAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum) var pos = 0 while (toAssign > 0) { if (usableWorkers(pos).coresFree - assigned(pos) > 0) { @@ -182,22 +182,22 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor // Now that we've decided how many cores to give on each node, let's actually give them for (pos <- 0 until numUsable) { if (assigned(pos) > 0) { - val exec = job.addExecutor(usableWorkers(pos), assigned(pos)) - launchExecutor(usableWorkers(pos), exec, job.desc.sparkHome) - job.state = JobState.RUNNING + val exec = app.addExecutor(usableWorkers(pos), assigned(pos)) + launchExecutor(usableWorkers(pos), exec, app.desc.sparkHome) + app.state = ApplicationState.RUNNING } } } } else { - // Pack each job into as few nodes as possible until we've assigned all its cores + // Pack each app into as few nodes as possible until we've assigned all its cores for (worker <- workers if worker.coresFree > 0) { - for (job <- waitingJobs if job.coresLeft > 0) { - if (canUse(job, worker)) { - val coresToUse = math.min(worker.coresFree, job.coresLeft) + for (app <- waitingApps if app.coresLeft > 0) { + if (canUse(app, worker)) { + val coresToUse = math.min(worker.coresFree, app.coresLeft) if (coresToUse > 0) { - val exec = job.addExecutor(worker, coresToUse) - launchExecutor(worker, exec, job.desc.sparkHome) - job.state = JobState.RUNNING + val exec = app.addExecutor(worker, coresToUse) + launchExecutor(worker, exec, app.desc.sparkHome) + app.state = ApplicationState.RUNNING } } } @@ -208,8 +208,8 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor def launchExecutor(worker: WorkerInfo, exec: ExecutorInfo, sparkHome: String) { logInfo("Launching executor " + exec.fullId + " on worker " + worker.id) worker.addExecutor(exec) - worker.actor ! LaunchExecutor(exec.job.id, exec.id, exec.job.desc, exec.cores, exec.memory, sparkHome) - exec.job.driver ! ExecutorAdded(exec.id, worker.id, worker.host, exec.cores, exec.memory) + worker.actor ! LaunchExecutor(exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory, sparkHome) + exec.application.driver ! ExecutorAdded(exec.id, worker.id, worker.host, exec.cores, exec.memory) } def addWorker(id: String, host: String, port: Int, cores: Int, memory: Int, webUiPort: Int, @@ -231,46 +231,46 @@ private[spark] class Master(ip: String, port: Int, webUiPort: Int) extends Actor actorToWorker -= worker.actor addressToWorker -= worker.actor.path.address for (exec <- worker.executors.values) { - logInfo("Telling job of lost executor: " + exec.id) - exec.job.driver ! ExecutorUpdated(exec.id, ExecutorState.LOST, Some("worker lost"), None) - exec.job.removeExecutor(exec) + logInfo("Telling app of lost executor: " + exec.id) + exec.application.driver ! 
ExecutorUpdated(exec.id, ExecutorState.LOST, Some("worker lost"), None) + exec.application.removeExecutor(exec) } } - def addJob(desc: JobDescription, driver: ActorRef): JobInfo = { + def addApplication(desc: ApplicationDescription, driver: ActorRef): ApplicationInfo = { val now = System.currentTimeMillis() val date = new Date(now) - val job = new JobInfo(now, newJobId(date), desc, date, driver) - jobs += job - idToJob(job.id) = job - actorToJob(driver) = job - addressToJob(driver.path.address) = job - return job + val app = new ApplicationInfo(now, newApplicationId(date), desc, date, driver) + apps += app + idToApp(app.id) = app + actorToApp(driver) = app + addressToApp(driver.path.address) = app + return app } - def removeJob(job: JobInfo) { - if (jobs.contains(job)) { - logInfo("Removing job " + job.id) - jobs -= job - idToJob -= job.id - actorToJob -= job.driver - addressToWorker -= job.driver.path.address - completedJobs += job // Remember it in our history - waitingJobs -= job - for (exec <- job.executors.values) { + def removeApplication(app: ApplicationInfo) { + if (apps.contains(app)) { + logInfo("Removing app " + app.id) + apps -= app + idToApp -= app.id + actorToApp -= app.driver + addressToWorker -= app.driver.path.address + completedApps += app // Remember it in our history + waitingApps -= app + for (exec <- app.executors.values) { exec.worker.removeExecutor(exec) - exec.worker.actor ! KillExecutor(exec.job.id, exec.id) + exec.worker.actor ! KillExecutor(exec.application.id, exec.id) } - job.markFinished(JobState.FINISHED) // TODO: Mark it as FAILED if it failed + app.markFinished(ApplicationState.FINISHED) // TODO: Mark it as FAILED if it failed schedule() } } - /** Generate a new job ID given a job's submission date */ - def newJobId(submitDate: Date): String = { - val jobId = "job-%s-%04d".format(DATE_FORMAT.format(submitDate), nextJobNumber) - nextJobNumber += 1 - jobId + /** Generate a new app ID given a app's submission date */ + def newApplicationId(submitDate: Date): String = { + val appId = "app-%s-%04d".format(DATE_FORMAT.format(submitDate), nextAppNumber) + nextAppNumber += 1 + appId } /** Check for, and remove, any timed-out workers */ diff --git a/core/src/main/scala/spark/deploy/master/MasterWebUI.scala b/core/src/main/scala/spark/deploy/master/MasterWebUI.scala index 529f72e9da..54faa375fb 100644 --- a/core/src/main/scala/spark/deploy/master/MasterWebUI.scala +++ b/core/src/main/scala/spark/deploy/master/MasterWebUI.scala @@ -40,27 +40,27 @@ class MasterWebUI(val actorSystem: ActorSystem, master: ActorRef) extends Direct } } } ~ - path("job") { - parameters("jobId", 'format ?) { - case (jobId, Some(js)) if (js.equalsIgnoreCase("json")) => + path("app") { + parameters("appId", 'format ?) { + case (appId, Some(js)) if (js.equalsIgnoreCase("json")) => val future = master ? RequestMasterState - val jobInfo = for (masterState <- future.mapTo[MasterState]) yield { - masterState.activeJobs.find(_.id == jobId).getOrElse({ - masterState.completedJobs.find(_.id == jobId).getOrElse(null) + val appInfo = for (masterState <- future.mapTo[MasterState]) yield { + masterState.activeApps.find(_.id == appId).getOrElse({ + masterState.completedApps.find(_.id == appId).getOrElse(null) }) } respondWithMediaType(MediaTypes.`application/json`) { ctx => - ctx.complete(jobInfo.mapTo[JobInfo]) + ctx.complete(appInfo.mapTo[ApplicationInfo]) } - case (jobId, _) => + case (appId, _) => completeWith { val future = master ? 
RequestMasterState future.map { state => val masterState = state.asInstanceOf[MasterState] - val job = masterState.activeJobs.find(_.id == jobId).getOrElse({ - masterState.completedJobs.find(_.id == jobId).getOrElse(null) + val app = masterState.activeApps.find(_.id == appId).getOrElse({ + masterState.completedApps.find(_.id == appId).getOrElse(null) }) - spark.deploy.master.html.job_details.render(job) + spark.deploy.master.html.app_details.render(app) } } } diff --git a/core/src/main/scala/spark/deploy/master/WorkerInfo.scala b/core/src/main/scala/spark/deploy/master/WorkerInfo.scala index 2e467007a0..23df1bb463 100644 --- a/core/src/main/scala/spark/deploy/master/WorkerInfo.scala +++ b/core/src/main/scala/spark/deploy/master/WorkerInfo.scala @@ -37,8 +37,8 @@ private[spark] class WorkerInfo( } } - def hasExecutor(job: JobInfo): Boolean = { - executors.values.exists(_.job == job) + def hasExecutor(app: ApplicationInfo): Boolean = { + executors.values.exists(_.application == app) } def webUiAddress : String = { diff --git a/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala index 69f34e604a..de11771c8e 100644 --- a/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/spark/deploy/worker/ExecutorRunner.scala @@ -1,7 +1,7 @@ package spark.deploy.worker import java.io._ -import spark.deploy.{ExecutorState, ExecutorStateChanged, JobDescription} +import spark.deploy.{ExecutorState, ExecutorStateChanged, ApplicationDescription} import akka.actor.ActorRef import spark.{Utils, Logging} import java.net.{URI, URL} @@ -14,9 +14,9 @@ import spark.deploy.ExecutorStateChanged * Manages the execution of one executor process. */ private[spark] class ExecutorRunner( - val jobId: String, + val appId: String, val execId: Int, - val jobDesc: JobDescription, + val appDesc: ApplicationDescription, val cores: Int, val memory: Int, val worker: ActorRef, @@ -26,7 +26,7 @@ private[spark] class ExecutorRunner( val workDir: File) extends Logging { - val fullId = jobId + "/" + execId + val fullId = appId + "/" + execId var workerThread: Thread = null var process: Process = null var shutdownHook: Thread = null @@ -60,7 +60,7 @@ private[spark] class ExecutorRunner( process.destroy() process.waitFor() } - worker ! ExecutorStateChanged(jobId, execId, ExecutorState.KILLED, None, None) + worker ! 
ExecutorStateChanged(appId, execId, ExecutorState.KILLED, None, None) Runtime.getRuntime.removeShutdownHook(shutdownHook) } } @@ -74,10 +74,10 @@ private[spark] class ExecutorRunner( } def buildCommandSeq(): Seq[String] = { - val command = jobDesc.command - val script = if (System.getProperty("os.name").startsWith("Windows")) "run.cmd" else "run"; + val command = appDesc.command + val script = if (System.getProperty("os.name").startsWith("Windows")) "run.cmd" else "run" val runScript = new File(sparkHome, script).getCanonicalPath - Seq(runScript, command.mainClass) ++ (command.arguments ++ Seq(jobId)).map(substituteVariables) + Seq(runScript, command.mainClass) ++ (command.arguments ++ Seq(appId)).map(substituteVariables) } /** Spawn a thread that will redirect a given stream to a file */ @@ -96,12 +96,12 @@ private[spark] class ExecutorRunner( } /** - * Download and run the executor described in our JobDescription + * Download and run the executor described in our ApplicationDescription */ def fetchAndRunExecutor() { try { // Create the executor's working directory - val executorDir = new File(workDir, jobId + "/" + execId) + val executorDir = new File(workDir, appId + "/" + execId) if (!executorDir.mkdirs()) { throw new IOException("Failed to create directory " + executorDir) } @@ -110,7 +110,7 @@ private[spark] class ExecutorRunner( val command = buildCommandSeq() val builder = new ProcessBuilder(command: _*).directory(executorDir) val env = builder.environment() - for ((key, value) <- jobDesc.command.environment) { + for ((key, value) <- appDesc.command.environment) { env.put(key, value) } env.put("SPARK_MEM", memory.toString + "m") @@ -128,7 +128,7 @@ private[spark] class ExecutorRunner( // times on the same machine. val exitCode = process.waitFor() val message = "Command exited with code " + exitCode - worker ! ExecutorStateChanged(jobId, execId, ExecutorState.FAILED, Some(message), + worker ! ExecutorStateChanged(appId, execId, ExecutorState.FAILED, Some(message), Some(exitCode)) } catch { case interrupted: InterruptedException => @@ -140,7 +140,7 @@ private[spark] class ExecutorRunner( process.destroy() } val message = e.getClass + ": " + e.getMessage - worker ! ExecutorStateChanged(jobId, execId, ExecutorState.FAILED, Some(message), None) + worker ! ExecutorStateChanged(appId, execId, ExecutorState.FAILED, Some(message), None) } } } diff --git a/core/src/main/scala/spark/deploy/worker/Worker.scala b/core/src/main/scala/spark/deploy/worker/Worker.scala index 924935a5fd..2bbc931316 100644 --- a/core/src/main/scala/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/spark/deploy/worker/Worker.scala @@ -109,19 +109,19 @@ private[spark] class Worker( logError("Worker registration failed: " + message) System.exit(1) - case LaunchExecutor(jobId, execId, jobDesc, cores_, memory_, execSparkHome_) => - logInfo("Asked to launch executor %s/%d for %s".format(jobId, execId, jobDesc.name)) + case LaunchExecutor(appId, execId, appDesc, cores_, memory_, execSparkHome_) => + logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name)) val manager = new ExecutorRunner( - jobId, execId, jobDesc, cores_, memory_, self, workerId, ip, new File(execSparkHome_), workDir) - executors(jobId + "/" + execId) = manager + appId, execId, appDesc, cores_, memory_, self, workerId, ip, new File(execSparkHome_), workDir) + executors(appId + "/" + execId) = manager manager.start() coresUsed += cores_ memoryUsed += memory_ - master ! 
ExecutorStateChanged(jobId, execId, ExecutorState.RUNNING, None, None) + master ! ExecutorStateChanged(appId, execId, ExecutorState.RUNNING, None, None) - case ExecutorStateChanged(jobId, execId, state, message, exitStatus) => - master ! ExecutorStateChanged(jobId, execId, state, message, exitStatus) - val fullId = jobId + "/" + execId + case ExecutorStateChanged(appId, execId, state, message, exitStatus) => + master ! ExecutorStateChanged(appId, execId, state, message, exitStatus) + val fullId = appId + "/" + execId if (ExecutorState.isFinished(state)) { val executor = executors(fullId) logInfo("Executor " + fullId + " finished with state " + state + @@ -133,8 +133,8 @@ private[spark] class Worker( memoryUsed -= executor.memory } - case KillExecutor(jobId, execId) => - val fullId = jobId + "/" + execId + case KillExecutor(appId, execId) => + val fullId = appId + "/" + execId executors.get(fullId) match { case Some(executor) => logInfo("Asked to kill executor " + fullId) diff --git a/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala b/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala index 37524a7c82..08f02bad80 100644 --- a/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala +++ b/core/src/main/scala/spark/deploy/worker/WorkerArguments.scala @@ -92,7 +92,7 @@ private[spark] class WorkerArguments(args: Array[String]) { "Options:\n" + " -c CORES, --cores CORES Number of cores to use\n" + " -m MEM, --memory MEM Amount of memory to use (e.g. 1000M, 2G)\n" + - " -d DIR, --work-dir DIR Directory to run jobs in (default: SPARK_HOME/work)\n" + + " -d DIR, --work-dir DIR Directory to run apps in (default: SPARK_HOME/work)\n" + " -i IP, --ip IP IP address or DNS name to listen on\n" + " -p PORT, --port PORT Port to listen on (default: random)\n" + " --webui-port PORT Port for web UI (default: 8081)") diff --git a/core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala b/core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala index ef81f072a3..135cc2e86c 100644 --- a/core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala +++ b/core/src/main/scala/spark/deploy/worker/WorkerWebUI.scala @@ -41,9 +41,9 @@ class WorkerWebUI(val actorSystem: ActorSystem, worker: ActorRef) extends Direct } } ~ path("log") { - parameters("jobId", "executorId", "logType") { (jobId, executorId, logType) => + parameters("appId", "executorId", "logType") { (appId, executorId, logType) => respondWithMediaType(cc.spray.http.MediaTypes.`text/plain`) { - getFromFileName("work/" + jobId + "/" + executorId + "/" + logType) + getFromFileName("work/" + appId + "/" + executorId + "/" + logType) } } } ~ diff --git a/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index e77355c6cd..bb289c9cf3 100644 --- a/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -2,14 +2,14 @@ package spark.scheduler.cluster import spark.{Utils, Logging, SparkContext} import spark.deploy.client.{Client, ClientListener} -import spark.deploy.{Command, JobDescription} +import spark.deploy.{Command, ApplicationDescription} import scala.collection.mutable.HashMap private[spark] class SparkDeploySchedulerBackend( scheduler: ClusterScheduler, sc: SparkContext, master: String, - jobName: String) + appName: String) extends StandaloneSchedulerBackend(scheduler, sc.env.actorSystem) with ClientListener with 
Logging { @@ -29,10 +29,11 @@ private[spark] class SparkDeploySchedulerBackend( StandaloneSchedulerBackend.ACTOR_NAME) val args = Seq(driverUrl, "{{EXECUTOR_ID}}", "{{HOSTNAME}}", "{{CORES}}") val command = Command("spark.executor.StandaloneExecutorBackend", args, sc.executorEnvs) - val sparkHome = sc.getSparkHome().getOrElse(throw new IllegalArgumentException("must supply spark home for spark standalone")) - val jobDesc = new JobDescription(jobName, maxCores, executorMemory, command, sparkHome) + val sparkHome = sc.getSparkHome().getOrElse( + throw new IllegalArgumentException("must supply spark home for spark standalone")) + val appDesc = new ApplicationDescription(appName, maxCores, executorMemory, command, sparkHome) - client = new Client(sc.env.actorSystem, master, jobDesc, this) + client = new Client(sc.env.actorSystem, master, appDesc, this) client.start() } @@ -45,8 +46,8 @@ private[spark] class SparkDeploySchedulerBackend( } } - override def connected(jobId: String) { - logInfo("Connected to Spark cluster with job ID " + jobId) + override def connected(appId: String) { + logInfo("Connected to Spark cluster with app ID " + appId) } override def disconnected() { diff --git a/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala index 7caf06e917..f4a2994b6d 100644 --- a/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/mesos/CoarseMesosSchedulerBackend.scala @@ -28,7 +28,7 @@ private[spark] class CoarseMesosSchedulerBackend( scheduler: ClusterScheduler, sc: SparkContext, master: String, - frameworkName: String) + appName: String) extends StandaloneSchedulerBackend(scheduler, sc.env.actorSystem) with MScheduler with Logging { @@ -76,7 +76,7 @@ private[spark] class CoarseMesosSchedulerBackend( setDaemon(true) override def run() { val scheduler = CoarseMesosSchedulerBackend.this - val fwInfo = FrameworkInfo.newBuilder().setUser("").setName(frameworkName).build() + val fwInfo = FrameworkInfo.newBuilder().setUser("").setName(appName).build() driver = new MesosSchedulerDriver(scheduler, fwInfo, master) try { { val ret = driver.run() diff --git a/core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala index 300766d0f5..ca7fab4cc5 100644 --- a/core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/spark/scheduler/mesos/MesosSchedulerBackend.scala @@ -24,7 +24,7 @@ private[spark] class MesosSchedulerBackend( scheduler: ClusterScheduler, sc: SparkContext, master: String, - frameworkName: String) + appName: String) extends SchedulerBackend with MScheduler with Logging { @@ -49,7 +49,7 @@ private[spark] class MesosSchedulerBackend( setDaemon(true) override def run() { val scheduler = MesosSchedulerBackend.this - val fwInfo = FrameworkInfo.newBuilder().setUser("").setName(frameworkName).build() + val fwInfo = FrameworkInfo.newBuilder().setUser("").setName(appName).build() driver = new MesosSchedulerDriver(scheduler, fwInfo, master) try { val ret = driver.run() diff --git a/core/src/main/twirl/spark/deploy/master/app_details.scala.html b/core/src/main/twirl/spark/deploy/master/app_details.scala.html new file mode 100644 index 0000000000..301a7e2124 --- /dev/null +++ b/core/src/main/twirl/spark/deploy/master/app_details.scala.html @@ -0,0 +1,40 @@ +@(app: spark.deploy.master.ApplicationInfo) + 
+@spark.common.html.layout(title = "Application Details") { + + +
+
+
    +
  • ID: @app.id
  • +
  • Description: @app.desc.name
  • +
  • User: @app.desc.user
  • +
  • Cores: + @app.desc.cores + (@app.coresGranted Granted + @if(app.desc.cores == Integer.MAX_VALUE) { + + } else { + , @app.coresLeft + } + ) +
  • +
  • Memory per Slave: @app.desc.memoryPerSlave
  • +
  • Submit Date: @app.submitDate
  • +
  • State: @app.state
  • +
+
+
+ +
+ + +
+
+

Executor Summary

+
+ @executors_table(app.executors.values.toList) +
+
+ +} diff --git a/core/src/main/twirl/spark/deploy/master/app_row.scala.html b/core/src/main/twirl/spark/deploy/master/app_row.scala.html new file mode 100644 index 0000000000..feb306f35c --- /dev/null +++ b/core/src/main/twirl/spark/deploy/master/app_row.scala.html @@ -0,0 +1,20 @@ +@(app: spark.deploy.master.ApplicationInfo) + +@import spark.Utils +@import spark.deploy.WebUI.formatDate +@import spark.deploy.WebUI.formatDuration + + + + @app.id + + @app.desc.name + + @app.coresGranted + + @Utils.memoryMegabytesToString(app.desc.memoryPerSlave) + @formatDate(app.submitDate) + @app.desc.user + @app.state.toString() + @formatDuration(app.duration) + diff --git a/core/src/main/twirl/spark/deploy/master/app_table.scala.html b/core/src/main/twirl/spark/deploy/master/app_table.scala.html new file mode 100644 index 0000000000..f789cee0f1 --- /dev/null +++ b/core/src/main/twirl/spark/deploy/master/app_table.scala.html @@ -0,0 +1,21 @@ +@(apps: Array[spark.deploy.master.ApplicationInfo]) + + + + + + + + + + + + + + + + @for(j <- apps) { + @app_row(j) + } + +
IDDescriptionCoresMemory per NodeSubmit TimeUserStateDuration
diff --git a/core/src/main/twirl/spark/deploy/master/executor_row.scala.html b/core/src/main/twirl/spark/deploy/master/executor_row.scala.html index 784d692fc2..d2d80fad48 100644 --- a/core/src/main/twirl/spark/deploy/master/executor_row.scala.html +++ b/core/src/main/twirl/spark/deploy/master/executor_row.scala.html @@ -9,7 +9,7 @@ @executor.memory @executor.state - stdout - stderr + stdout + stderr - \ No newline at end of file + diff --git a/core/src/main/twirl/spark/deploy/master/index.scala.html b/core/src/main/twirl/spark/deploy/master/index.scala.html index cb1651c7e1..ac51a39a51 100644 --- a/core/src/main/twirl/spark/deploy/master/index.scala.html +++ b/core/src/main/twirl/spark/deploy/master/index.scala.html @@ -14,7 +14,7 @@ @{state.workers.map(_.coresUsed).sum} Used
  • Memory: @{Utils.memoryMegabytesToString(state.workers.map(_.memory).sum)} Total, @{Utils.memoryMegabytesToString(state.workers.map(_.memoryUsed).sum)} Used
  • -
  • Jobs: @state.activeJobs.size Running, @state.completedJobs.size Completed
  • +
  • Applications: @state.activeApps.size Running, @state.completedApps.size Completed
  • @@ -22,7 +22,7 @@
    -

    Cluster Summary

    +

    Workers


    @worker_table(state.workers.sortBy(_.id))
    @@ -30,23 +30,23 @@
    - +
    -

    Running Jobs

    +

    Running Applications


    - @job_table(state.activeJobs.sortBy(_.startTime).reverse) + @app_table(state.activeApps.sortBy(_.startTime).reverse)

    - +
    -

    Completed Jobs

    +

    Completed Applications


    - @job_table(state.completedJobs.sortBy(_.endTime).reverse) + @app_table(state.completedApps.sortBy(_.endTime).reverse)
    diff --git a/core/src/main/twirl/spark/deploy/master/job_details.scala.html b/core/src/main/twirl/spark/deploy/master/job_details.scala.html deleted file mode 100644 index d02a51b214..0000000000 --- a/core/src/main/twirl/spark/deploy/master/job_details.scala.html +++ /dev/null @@ -1,40 +0,0 @@ -@(job: spark.deploy.master.JobInfo) - -@spark.common.html.layout(title = "Job Details") { - - -
    -
    -
      -
    • ID: @job.id
    • -
    • Description: @job.desc.name
    • -
    • User: @job.desc.user
    • -
    • Cores: - @job.desc.cores - (@job.coresGranted Granted - @if(job.desc.cores == Integer.MAX_VALUE) { - - } else { - , @job.coresLeft - } - ) -
    • -
    • Memory per Slave: @job.desc.memoryPerSlave
    • -
    • Submit Date: @job.submitDate
    • -
    • State: @job.state
    • -
    -
    -
    - -
    - - -
    -
    -

    Executor Summary

    -
    - @executors_table(job.executors.values.toList) -
    -
    - -} diff --git a/core/src/main/twirl/spark/deploy/master/job_row.scala.html b/core/src/main/twirl/spark/deploy/master/job_row.scala.html deleted file mode 100644 index 7c466a6a2c..0000000000 --- a/core/src/main/twirl/spark/deploy/master/job_row.scala.html +++ /dev/null @@ -1,20 +0,0 @@ -@(job: spark.deploy.master.JobInfo) - -@import spark.Utils -@import spark.deploy.WebUI.formatDate -@import spark.deploy.WebUI.formatDuration - - - - @job.id - - @job.desc.name - - @job.coresGranted - - @Utils.memoryMegabytesToString(job.desc.memoryPerSlave) - @formatDate(job.submitDate) - @job.desc.user - @job.state.toString() - @formatDuration(job.duration) - diff --git a/core/src/main/twirl/spark/deploy/master/job_table.scala.html b/core/src/main/twirl/spark/deploy/master/job_table.scala.html deleted file mode 100644 index d267d6e85e..0000000000 --- a/core/src/main/twirl/spark/deploy/master/job_table.scala.html +++ /dev/null @@ -1,21 +0,0 @@ -@(jobs: Array[spark.deploy.master.JobInfo]) - - - - - - - - - - - - - - - - @for(j <- jobs) { - @job_row(j) - } - -
    JobIDDescriptionCoresMemory per NodeSubmit TimeUserStateDuration
    diff --git a/core/src/main/twirl/spark/deploy/worker/executor_row.scala.html b/core/src/main/twirl/spark/deploy/worker/executor_row.scala.html index ea9542461e..dad0a89080 100644 --- a/core/src/main/twirl/spark/deploy/worker/executor_row.scala.html +++ b/core/src/main/twirl/spark/deploy/worker/executor_row.scala.html @@ -8,13 +8,13 @@ @Utils.memoryMegabytesToString(executor.memory)
      -
    • ID: @executor.jobId
    • -
    • Name: @executor.jobDesc.name
    • -
    • User: @executor.jobDesc.user
    • +
    • ID: @executor.appId
    • +
    • Name: @executor.appDesc.name
    • +
    • User: @executor.appDesc.user
    - stdout - stderr + stdout + stderr diff --git a/streaming/src/main/scala/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/spark/streaming/Checkpoint.scala index 2f3adb39c2..80244520a3 100644 --- a/streaming/src/main/scala/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/spark/streaming/Checkpoint.scala @@ -12,7 +12,7 @@ private[streaming] class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time) extends Logging with Serializable { val master = ssc.sc.master - val framework = ssc.sc.jobName + val framework = ssc.sc.appName val sparkHome = ssc.sc.sparkHome val jars = ssc.sc.jars val graph = ssc.graph diff --git a/streaming/src/main/scala/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/spark/streaming/StreamingContext.scala index 37ba524b48..0cce2b13cf 100644 --- a/streaming/src/main/scala/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/StreamingContext.scala @@ -39,11 +39,11 @@ class StreamingContext private ( /** * Creates a StreamingContext by providing the details necessary for creating a new SparkContext. * @param master Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). - * @param frameworkName A name for your job, to display on the cluster web UI + * @param appName A name for your job, to display on the cluster web UI * @param batchDuration The time interval at which streaming data will be divided into batches */ - def this(master: String, frameworkName: String, batchDuration: Duration) = - this(StreamingContext.createNewSparkContext(master, frameworkName), null, batchDuration) + def this(master: String, appName: String, batchDuration: Duration) = + this(StreamingContext.createNewSparkContext(master, appName), null, batchDuration) /** * Re-creates a StreamingContext from a checkpoint file. @@ -384,14 +384,14 @@ object StreamingContext { new PairDStreamFunctions[K, V](stream) } - protected[streaming] def createNewSparkContext(master: String, frameworkName: String): SparkContext = { + protected[streaming] def createNewSparkContext(master: String, appName: String): SparkContext = { // Set the default cleaner delay to an hour if not already set. // This should be sufficient for even 1 second interval. if (MetadataCleaner.getDelaySeconds < 0) { MetadataCleaner.setDelaySeconds(3600) } - new SparkContext(master, frameworkName) + new SparkContext(master, appName) } protected[streaming] def rddToFileName[T](prefix: String, suffix: String, time: Time): String = { diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala index e7f446a49b..e5b5e9ac23 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala @@ -27,11 +27,11 @@ class JavaStreamingContext(val ssc: StreamingContext) { /** * Creates a StreamingContext. 
* @param master Name of the Spark Master - * @param frameworkName Name to be used when registering with the scheduler + * @param appName Name to be used when registering with the scheduler * @param batchDuration The time interval at which streaming data will be divided into batches */ - def this(master: String, frameworkName: String, batchDuration: Duration) = - this(new StreamingContext(master, frameworkName, batchDuration)) + def this(master: String, appName: String, batchDuration: Duration) = + this(new StreamingContext(master, appName, batchDuration)) /** * Creates a StreamingContext. -- cgit v1.2.3 From 03f45a18d57ea1eceeb688154c01a1d460744600 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Mon, 18 Feb 2013 16:56:01 -0800 Subject: Use port 5080 for httpd/ganglia --- ec2/spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 7967bcac50..89cfbad876 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -179,7 +179,7 @@ def launch_cluster(conn, opts, cluster_name): if opts.cluster_type == "mesos": master_group.authorize('tcp', 38090, 38090, '0.0.0.0/0') if opts.ganglia: - master_group.authorize('tcp', 80, 80, '0.0.0.0/0') + master_group.authorize('tcp', 5080, 5080, '0.0.0.0/0') if slave_group.rules == []: # Group was just now created slave_group.authorize(src_group=master_group) slave_group.authorize(src_group=slave_group) -- cgit v1.2.3 From e7cdf7a6a496ee76a2c53ac27514c469d929471e Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Mon, 18 Feb 2013 17:15:22 -0800 Subject: Print ganglia url after setup --- ec2/spark_ec2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 89cfbad876..6056498577 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -415,6 +415,8 @@ def setup_standalone_cluster(master, slave_nodes, opts): def setup_spark_cluster(master, opts): ssh(master, opts, "chmod u+x spark-ec2/setup.sh") ssh(master, opts, "spark-ec2/setup.sh") + if opts.ganglia: + print "Ganglia started at http://%s:5080/ganglia" % master # Wait for a whole cluster (masters, slaves and ZooKeeper) to start up -- cgit v1.2.3 From 6cba5a48b0bc1a90dd7a914a82636fcd33294822 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Mon, 18 Feb 2013 18:30:36 -0800 Subject: Print cluster url after setup completes --- ec2/spark_ec2.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 6056498577..66b1faf2cd 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -415,6 +415,11 @@ def setup_standalone_cluster(master, slave_nodes, opts): def setup_spark_cluster(master, opts): ssh(master, opts, "chmod u+x spark-ec2/setup.sh") ssh(master, opts, "spark-ec2/setup.sh") + if opts.cluster_type == "mesos": + print "Mesos cluster started at http://%s:8080" % master + elif opts.cluster_type == "standalone": + print "Spark standalone cluster started at http://%s:8080" % master + if opts.ganglia: print "Ganglia started at http://%s:5080/ganglia" % master -- cgit v1.2.3 From 9d49a6b03fb91d516bf40e50f67e87155c69dba1 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sat, 16 Feb 2013 15:56:04 -0800 Subject: Use RDD type for `foreach` operator in Java. 
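In user-facing terms, the change below means that `foreach` on a JavaPairDStream now hands its callback a JavaPairRDD rather than a plain JavaRDD of tuples. A small sketch, written in Scala against the Java API to keep one language across these examples; the `counts` stream, its element types, and the wrapper method are hypothetical.

    import spark.api.java.JavaPairRDD
    import spark.api.java.function.{Function => JFunction}
    import spark.streaming.api.java.JavaPairDStream

    // Hypothetical (word, count) stream; with the generalized signature, the callback
    // parameter is a JavaPairRDD, not a JavaRDD of Tuple2 values.
    def registerBatchLogger(counts: JavaPairDStream[String, java.lang.Integer]) {
      counts.foreach(new JFunction[JavaPairRDD[String, java.lang.Integer], Void] {
        def call(rdd: JavaPairRDD[String, java.lang.Integer]): Void = {
          println("Batch contains " + rdd.count() + " records")
          null // Void-returning Java API functions return null
        }
      })
    }

The same R type parameter threads through the `transform` change in the following commit, which is what allows pair-to-pair and pair-to-plain transformations from Java.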
--- .../main/scala/spark/streaming/api/java/JavaDStream.scala | 5 ++++- .../scala/spark/streaming/api/java/JavaDStreamLike.scala | 15 +++++++++------ .../scala/spark/streaming/api/java/JavaPairDStream.scala | 4 +++- .../spark/streaming/api/java/JavaStreamingContext.scala | 2 +- streaming/src/test/java/spark/streaming/JavaAPISuite.java | 1 + .../src/test/java/spark/streaming/JavaTestUtils.scala | 5 +++-- 6 files changed, 21 insertions(+), 11 deletions(-) diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStream.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStream.scala index 30985b4ebc..51efe6cae8 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStream.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStream.scala @@ -4,6 +4,7 @@ import spark.streaming.{Duration, Time, DStream} import spark.api.java.function.{Function => JFunction} import spark.api.java.JavaRDD import spark.storage.StorageLevel +import spark.RDD /** * A Discretized Stream (DStream), the basic abstraction in Spark Streaming, is a continuous @@ -26,7 +27,9 @@ import spark.storage.StorageLevel * - A function that is used to generate an RDD after each time interval */ class JavaDStream[T](val dstream: DStream[T])(implicit val classManifest: ClassManifest[T]) - extends JavaDStreamLike[T, JavaDStream[T]] { + extends JavaDStreamLike[T, JavaDStream[T], JavaRDD[T]] { + + override def wrapRDD(rdd: RDD[T]): JavaRDD[T] = JavaRDD.fromRDD(rdd) /** Return a new DStream containing only the elements that satisfy a predicate. */ def filter(f: JFunction[T, java.lang.Boolean]): JavaDStream[T] = diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index 1c1ba05ff9..4e1458ca9e 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -6,17 +6,20 @@ import java.lang.{Long => JLong} import scala.collection.JavaConversions._ import spark.streaming._ -import spark.api.java.JavaRDD +import spark.api.java.{JavaRDDLike, JavaRDD} import spark.api.java.function.{Function2 => JFunction2, Function => JFunction, _} import java.util import spark.RDD import JavaDStream._ -trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This]] extends Serializable { +trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T, R]] + extends Serializable { implicit val classManifest: ClassManifest[T] def dstream: DStream[T] + def wrapRDD(in: RDD[T]): R + implicit def scalaIntToJavaLong(in: DStream[Long]): JavaDStream[JLong] = { in.map(new JLong(_)) } @@ -220,16 +223,16 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This]] extends Serializable * Apply a function to each RDD in this DStream. This is an output operator, so * this DStream will be registered as an output stream and therefore materialized. */ - def foreach(foreachFunc: JFunction[JavaRDD[T], Void]) { - dstream.foreach(rdd => foreachFunc.call(new JavaRDD(rdd))) + def foreach(foreachFunc: JFunction[R, Void]) { + dstream.foreach(rdd => foreachFunc.call(wrapRDD(rdd))) } /** * Apply a function to each RDD in this DStream. This is an output operator, so * this DStream will be registered as an output stream and therefore materialized. 
*/ - def foreach(foreachFunc: JFunction2[JavaRDD[T], Time, Void]) { - dstream.foreach((rdd, time) => foreachFunc.call(new JavaRDD(rdd), time)) + def foreach(foreachFunc: JFunction2[R, Time, Void]) { + dstream.foreach((rdd, time) => foreachFunc.call(wrapRDD(rdd), time)) } /** diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala index 952ca657bf..de3e802300 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala @@ -19,7 +19,9 @@ import com.google.common.base.Optional class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( implicit val kManifiest: ClassManifest[K], implicit val vManifest: ClassManifest[V]) - extends JavaDStreamLike[(K, V), JavaPairDStream[K, V]] { + extends JavaDStreamLike[(K, V), JavaPairDStream[K, V], JavaPairRDD[K, V]] { + + override def wrapRDD(rdd: RDD[(K, V)]): JavaPairRDD[K, V] = JavaPairRDD.fromRDD(rdd) // ======================================================================= // Methods common to all DStream's diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala index d9a676819a..878e179589 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaStreamingContext.scala @@ -254,7 +254,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { /** * Registers an output stream that will be computed every interval */ - def registerOutputStream(outputStream: JavaDStreamLike[_, _]) { + def registerOutputStream(outputStream: JavaDStreamLike[_, _, _]) { ssc.registerOutputStream(outputStream.dstream) } diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 5d510fd89f..4fe2de5a1a 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -12,6 +12,7 @@ import org.junit.Test; import scala.Tuple2; import spark.HashPartitioner; import spark.api.java.JavaRDD; +import spark.api.java.JavaRDDLike; import spark.api.java.JavaSparkContext; import spark.api.java.function.*; import spark.storage.StorageLevel; diff --git a/streaming/src/test/java/spark/streaming/JavaTestUtils.scala b/streaming/src/test/java/spark/streaming/JavaTestUtils.scala index 52ea28732a..64a7e7cbf9 100644 --- a/streaming/src/test/java/spark/streaming/JavaTestUtils.scala +++ b/streaming/src/test/java/spark/streaming/JavaTestUtils.scala @@ -31,8 +31,9 @@ trait JavaTestBase extends TestSuiteBase { * Attach a provided stream to it's associated StreamingContext as a * [[spark.streaming.TestOutputStream]]. 
**/ - def attachTestOutputStream[T, This <: spark.streaming.api.java.JavaDStreamLike[T,This]]( - dstream: JavaDStreamLike[T, This]) = { + def attachTestOutputStream[T, This <: spark.streaming.api.java.JavaDStreamLike[T, This, R], + R <: spark.api.java.JavaRDDLike[T, R]]( + dstream: JavaDStreamLike[T, This, R]) = { implicit val cm: ClassManifest[T] = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[T]] val ostream = new TestOutputStream(dstream.dstream, -- cgit v1.2.3 From 35880de42edb30cf705036083710c85a74a351fa Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sat, 16 Feb 2013 16:36:12 -0800 Subject: Use RDD type for `transform` operator in Java. This is an improved implementation of the `transform` operator in Java. The main difference is that this allows all four possible types of transform functions 1. JavaRDD -> JavaRDD 2. JavaRDD -> JavaPairRDD 3. JavaPairRDD -> JavaPairRDD 4. JavaPairRDD -> JavaRDD whereas previously only (1) and (3) were possible. Conflicts: streaming/src/test/java/spark/streaming/JavaAPISuite.java --- .../spark/streaming/api/java/JavaDStreamLike.scala | 40 ++++++++-- .../test/java/spark/streaming/JavaAPISuite.java | 89 +++++++++++++++++++++- 2 files changed, 122 insertions(+), 7 deletions(-) diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index 4e1458ca9e..f7b1704884 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -6,7 +6,7 @@ import java.lang.{Long => JLong} import scala.collection.JavaConversions._ import spark.streaming._ -import spark.api.java.{JavaRDDLike, JavaRDD} +import spark.api.java.{JavaPairRDD, JavaRDDLike, JavaRDD} import spark.api.java.function.{Function2 => JFunction2, Function => JFunction, _} import java.util import spark.RDD @@ -239,11 +239,11 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T * Return a new DStream in which each RDD is generated by applying a function * on each RDD of this DStream. */ - def transform[U](transformFunc: JFunction[JavaRDD[T], JavaRDD[U]]): JavaDStream[U] = { + def transform[U](transformFunc: JFunction[R, JavaRDD[U]]): JavaDStream[U] = { implicit val cm: ClassManifest[U] = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[U]] def scalaTransform (in: RDD[T]): RDD[U] = - transformFunc.call(new JavaRDD[T](in)).rdd + transformFunc.call(wrapRDD(in)).rdd dstream.transform(scalaTransform(_)) } @@ -251,11 +251,41 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T * Return a new DStream in which each RDD is generated by applying a function * on each RDD of this DStream. */ - def transform[U](transformFunc: JFunction2[JavaRDD[T], Time, JavaRDD[U]]): JavaDStream[U] = { + def transform[U](transformFunc: JFunction2[R, Time, JavaRDD[U]]): JavaDStream[U] = { implicit val cm: ClassManifest[U] = implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[U]] def scalaTransform (in: RDD[T], time: Time): RDD[U] = - transformFunc.call(new JavaRDD[T](in), time).rdd + transformFunc.call(wrapRDD(in), time).rdd + dstream.transform(scalaTransform(_, _)) + } + + /** + * Return a new DStream in which each RDD is generated by applying a function + * on each RDD of this DStream. 
+ */ + def transform[K2, V2](transformFunc: JFunction[R, JavaPairRDD[K2, V2]]): + JavaPairDStream[K2, V2] = { + implicit val cmk: ClassManifest[K2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K2]] + implicit val cmv: ClassManifest[V2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[V2]] + def scalaTransform (in: RDD[T]): RDD[(K2, V2)] = + transformFunc.call(wrapRDD(in)).rdd + dstream.transform(scalaTransform(_)) + } + + /** + * Return a new DStream in which each RDD is generated by applying a function + * on each RDD of this DStream. + */ + def transform[K2, V2](transformFunc: JFunction2[R, Time, JavaPairRDD[K2, V2]]): + JavaPairDStream[K2, V2] = { + implicit val cmk: ClassManifest[K2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[K2]] + implicit val cmv: ClassManifest[V2] = + implicitly[ClassManifest[AnyRef]].asInstanceOf[ClassManifest[V2]] + def scalaTransform (in: RDD[T], time: Time): RDD[(K2, V2)] = + transformFunc.call(wrapRDD(in), time).rdd dstream.transform(scalaTransform(_, _)) } diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 4fe2de5a1a..9be680dbdc 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -294,8 +294,9 @@ public class JavaAPISuite implements Serializable { Arrays.asList(6,7,8), Arrays.asList(9,10,11)); - JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); - JavaDStream transformed = stream.transform(new Function, JavaRDD>() { + JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); + JavaDStream transformed = + stream.transform(new Function, JavaRDD>() { @Override public JavaRDD call(JavaRDD in) throws Exception { return in.map(new Function() { @@ -742,6 +743,90 @@ public class JavaAPISuite implements Serializable { } @Test + public void testPairTransform() { + List>> inputData = Arrays.asList( + Arrays.asList( + new Tuple2(3, 5), + new Tuple2(1, 5), + new Tuple2(4, 5), + new Tuple2(2, 5)), + Arrays.asList( + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5), + new Tuple2(1, 5))); + + List>> expected = Arrays.asList( + Arrays.asList( + new Tuple2(1, 5), + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5)), + Arrays.asList( + new Tuple2(1, 5), + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5))); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream( + ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + + JavaPairDStream sorted = pairStream.transform( + new Function, JavaPairRDD>() { + @Override + public JavaPairRDD call(JavaPairRDD in) throws Exception { + return in.sortByKey(); + } + }); + + JavaTestUtils.attachTestOutputStream(sorted); + List>> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + + @Test + public void testPairToNormalRDDTransform() { + List>> inputData = Arrays.asList( + Arrays.asList( + new Tuple2(3, 5), + new Tuple2(1, 5), + new Tuple2(4, 5), + new Tuple2(2, 5)), + Arrays.asList( + new Tuple2(2, 5), + new Tuple2(3, 5), + new Tuple2(4, 5), + new Tuple2(1, 5))); + + List> expected = Arrays.asList( + Arrays.asList(3,1,4,2), + Arrays.asList(2,3,4,1)); + + JavaDStream> stream = JavaTestUtils.attachTestInputStream( + ssc, inputData, 1); + JavaPairDStream pairStream = JavaPairDStream.fromJavaDStream(stream); + + JavaDStream firstParts = pairStream.transform( + new 
Function, JavaRDD>() { + @Override + public JavaRDD call(JavaPairRDD in) throws Exception { + return in.map(new Function, Integer>() { + @Override + public Integer call(Tuple2 in) { + return in._1(); + } + }); + } + }); + + JavaTestUtils.attachTestOutputStream(firstParts); + List> result = JavaTestUtils.runStreams(ssc, 2, 2); + + Assert.assertEquals(expected, result); + } + public void testMapValues() { List>> inputData = stringStringKVStream; -- cgit v1.2.3 From fed1122d74c9d92ce26c28b0bf429e556dcc9bdd Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sat, 16 Feb 2013 16:43:23 -0800 Subject: Use RDD type for `slice` operator in Java. This commit uses the RDD type in `slice`, making it available to both normal and pair RDD's in java. It also updates the signature for `slice` to match changes in the Scala API. --- .../src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala index f7b1704884..8be36200ec 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaDStreamLike.scala @@ -215,8 +215,8 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T /** * Return all the RDDs between 'fromDuration' to 'toDuration' (both included) */ - def slice(fromDuration: Duration, toDuration: Duration): JList[JavaRDD[T]] = { - new util.ArrayList(dstream.slice(fromDuration, toDuration).map(new JavaRDD(_)).toSeq) + def slice(fromTime: Time, toTime: Time): JList[R] = { + new util.ArrayList(dstream.slice(fromTime, toTime).map(wrapRDD(_)).toSeq) } /** -- cgit v1.2.3 From 041c19e5f0309e7a667faab5fee9f9081db58237 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Tue, 19 Feb 2013 08:44:20 -0800 Subject: Small changes that were missing in merge --- streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala | 1 + streaming/src/test/java/spark/streaming/JavaAPISuite.java | 1 + 2 files changed, 2 insertions(+) diff --git a/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala index de3e802300..c1c8783559 100644 --- a/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/spark/streaming/api/java/JavaPairDStream.scala @@ -15,6 +15,7 @@ import org.apache.hadoop.conf.Configuration import spark.api.java.JavaPairRDD import spark.storage.StorageLevel import com.google.common.base.Optional +import spark.RDD class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( implicit val kManifiest: ClassManifest[K], diff --git a/streaming/src/test/java/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/spark/streaming/JavaAPISuite.java index 9be680dbdc..53fac14386 100644 --- a/streaming/src/test/java/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/spark/streaming/JavaAPISuite.java @@ -13,6 +13,7 @@ import scala.Tuple2; import spark.HashPartitioner; import spark.api.java.JavaRDD; import spark.api.java.JavaRDDLike; +import spark.api.java.JavaPairRDD; import spark.api.java.JavaSparkContext; import spark.api.java.function.*; import spark.storage.StorageLevel; -- cgit v1.2.3 From 687581c3ec2b6b8310bd5be9f2d15b25b9051aac Mon Sep 17 00:00:00 2001 From: Charles Reiss Date: Tue, 19 Feb 2013 11:52:35 -0800 Subject: Paranoid uncaught 
exception handling for exceptions during shutdown --- core/src/main/scala/spark/executor/Executor.scala | 29 ++++++++++++++++++----- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/spark/executor/Executor.scala b/core/src/main/scala/spark/executor/Executor.scala index bd21ba719a..b63bec11ad 100644 --- a/core/src/main/scala/spark/executor/Executor.scala +++ b/core/src/main/scala/spark/executor/Executor.scala @@ -50,14 +50,31 @@ private[spark] class Executor extends Logging { override def uncaughtException(thread: Thread, exception: Throwable) { try { logError("Uncaught exception in thread " + thread, exception) - if (exception.isInstanceOf[OutOfMemoryError]) { - System.exit(ExecutorExitCode.OOM) - } else { - System.exit(ExecutorExitCode.UNCAUGHT_EXCEPTION) + + // We may have been called from a shutdown hook. If so, we must not call System.exit(). + // (If we do, we will deadlock.) Runtime#addShutdownHook should fail if we are shutting + // down, which would either occur if we were called from a shutdown hook or if + // a System.exit() occured concurrently. + var shuttingDown = false + try { + val hook = new Thread { + override def run() {} + } + Runtime.getRuntime.addShutdownHook(hook) + Runtime.getRuntime.removeShutdownHook(hook) + } catch { + case ise: IllegalStateException => shuttingDown = true + } + if (!shuttingDown) { + if (exception.isInstanceOf[OutOfMemoryError]) { + System.exit(ExecutorExitCode.OOM) + } else { + System.exit(ExecutorExitCode.UNCAUGHT_EXCEPTION) + } } } catch { - case oom: OutOfMemoryError => System.exit(ExecutorExitCode.OOM) - case t: Throwable => System.exit(ExecutorExitCode.UNCAUGHT_EXCEPTION_TWICE) + case oom: OutOfMemoryError => Runtime.getRuntime.halt(ExecutorExitCode.OOM) + case t: Throwable => Runtime.getRuntime.halt(ExecutorExitCode.UNCAUGHT_EXCEPTION_TWICE) } } } -- cgit v1.2.3 From d0588bd6d7da3ba5adaba24303ad8616bdc2484f Mon Sep 17 00:00:00 2001 From: Charles Reiss Date: Tue, 19 Feb 2013 11:53:01 -0800 Subject: Catch/log errors deleting temp dirs --- core/src/main/scala/spark/storage/DiskStore.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/spark/storage/DiskStore.scala b/core/src/main/scala/spark/storage/DiskStore.scala index 7e5b820cbb..ddbf8821ad 100644 --- a/core/src/main/scala/spark/storage/DiskStore.scala +++ b/core/src/main/scala/spark/storage/DiskStore.scala @@ -178,7 +178,11 @@ private class DiskStore(blockManager: BlockManager, rootDirs: String) Runtime.getRuntime.addShutdownHook(new Thread("delete Spark local dirs") { override def run() { logDebug("Shutdown hook called") - localDirs.foreach(localDir => Utils.deleteRecursively(localDir)) + try { + localDirs.foreach(localDir => Utils.deleteRecursively(localDir)) + } catch { + case t: Throwable => logError("Exception while deleting local spark dirs", t) + } } }) } -- cgit v1.2.3 From 130f704bafe9e327e8974f6ed3a4e00c478f6279 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 19 Feb 2013 16:03:52 -0800 Subject: Added a method to create PartitionPruningRDD. 
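The `create` factory added in the diff below builds a PartitionPruningRDD without the caller having to supply a ClassManifest, which is what makes it usable when the element type T is not known at compile time. As a rough usage sketch (the context name, data, and partition count are illustrative, not taken from this patch):

    import spark.SparkContext
    import spark.rdd.PartitionPruningRDD

    val sc = new SparkContext("local", "pruning-sketch")  // hypothetical local context
    val data = sc.parallelize(1 to 100, 4)                // an RDD with 4 partitions
    // Keep only partition 0; the predicate receives partition indices.
    val pruned = PartitionPruningRDD.create(data, partitionIndex => partitionIndex == 0)
    println(pruned.collect().toSeq)

Because pruning happens before tasks are created, the filtered-out partitions are never evaluated.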
--- core/src/main/scala/spark/rdd/PartitionPruningRDD.scala | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala b/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala index f2f4fd56d1..41ff62dd22 100644 --- a/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala +++ b/core/src/main/scala/spark/rdd/PartitionPruningRDD.scala @@ -40,3 +40,15 @@ class PartitionPruningRDD[T: ClassManifest]( override protected def getPartitions: Array[Partition] = getDependencies.head.asInstanceOf[PruneDependency[T]].partitions } + + +object PartitionPruningRDD { + + /** + * Create a PartitionPruningRDD. This function can be used to create the PartitionPruningRDD + * when its type T is not known at compile time. + */ + def create[T](rdd: RDD[T], partitionFilterFunc: Int => Boolean) = { + new PartitionPruningRDD[T](rdd, partitionFilterFunc)(rdd.elementClassManifest) + } +} -- cgit v1.2.3 From ecd137a72da189c52b92a1286b004740706bd936 Mon Sep 17 00:00:00 2001 From: Andy Konwinski Date: Tue, 19 Feb 2013 16:58:02 -0800 Subject: Fixes link to issue tracker in documentation page "Contributing to Spark". --- docs/_config.yml | 1 + docs/contributing-to-spark.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/_config.yml b/docs/_config.yml index 2bd2eecc86..09617e4a1e 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -7,3 +7,4 @@ SPARK_VERSION: 0.7.0-SNAPSHOT SPARK_VERSION_SHORT: 0.7.0 SCALA_VERSION: 2.9.2 MESOS_VERSION: 0.9.0-incubating +SPARK_ISSUE_TRACKER_URL: https://spark-project.atlassian.net diff --git a/docs/contributing-to-spark.md b/docs/contributing-to-spark.md index 14d0dc856b..50feeb2d6c 100644 --- a/docs/contributing-to-spark.md +++ b/docs/contributing-to-spark.md @@ -15,7 +15,7 @@ The Spark team welcomes contributions in the form of GitHub pull requests. Here But first, make sure that you have [configured a spark-env.sh](configuration.html) with at least `SCALA_HOME`, as some of the tests try to spawn subprocesses using this. - Add new unit tests for your code. We use [ScalaTest](http://www.scalatest.org/) for testing. Just add a new Suite in `core/src/test`, or methods to an existing Suite. -- If you'd like to report a bug but don't have time to fix it, you can still post it to our [issue tracker](https://spark-project.atlassian.net), or email the [mailing list](http://www.spark-project.org/mailing-lists.html). +- If you'd like to report a bug but don't have time to fix it, you can still post it to our [issue tracker]({{site.SPARK_ISSUE_TRACKER_URL}}), or email the [mailing list](http://www.spark-project.org/mailing-lists.html). # Licensing of Contributions -- cgit v1.2.3 From 092c631fa8da6381b814f4d262c884ba08629b39 Mon Sep 17 00:00:00 2001 From: Charles Reiss Date: Tue, 19 Feb 2013 17:49:55 -0800 Subject: Pull detection of being in a shutdown hook into utility function. 
--- core/src/main/scala/spark/Utils.scala | 21 +++++++++++++++++++++ core/src/main/scala/spark/executor/Executor.scala | 16 ++-------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/spark/Utils.scala b/core/src/main/scala/spark/Utils.scala index 28d643abca..81daacf958 100644 --- a/core/src/main/scala/spark/Utils.scala +++ b/core/src/main/scala/spark/Utils.scala @@ -454,4 +454,25 @@ private object Utils extends Logging { def clone[T](value: T, serializer: SerializerInstance): T = { serializer.deserialize[T](serializer.serialize(value)) } + + /** + * Detect whether this thread might be executing a shutdown hook. Will always return true if + * the current thread is a running a shutdown hook but may spuriously return true otherwise (e.g. + * if System.exit was just called by a concurrent thread). + * + * Currently, this detects whether the JVM is shutting down by Runtime#addShutdownHook throwing + * an IllegalStateException. + */ + def inShutdown(): Boolean = { + try { + val hook = new Thread { + override def run() {} + } + Runtime.getRuntime.addShutdownHook(hook) + Runtime.getRuntime.removeShutdownHook(hook) + } catch { + case ise: IllegalStateException => return true + } + return false + } } diff --git a/core/src/main/scala/spark/executor/Executor.scala b/core/src/main/scala/spark/executor/Executor.scala index b63bec11ad..5de09030aa 100644 --- a/core/src/main/scala/spark/executor/Executor.scala +++ b/core/src/main/scala/spark/executor/Executor.scala @@ -52,20 +52,8 @@ private[spark] class Executor extends Logging { logError("Uncaught exception in thread " + thread, exception) // We may have been called from a shutdown hook. If so, we must not call System.exit(). - // (If we do, we will deadlock.) Runtime#addShutdownHook should fail if we are shutting - // down, which would either occur if we were called from a shutdown hook or if - // a System.exit() occured concurrently. - var shuttingDown = false - try { - val hook = new Thread { - override def run() {} - } - Runtime.getRuntime.addShutdownHook(hook) - Runtime.getRuntime.removeShutdownHook(hook) - } catch { - case ise: IllegalStateException => shuttingDown = true - } - if (!shuttingDown) { + // (If we do, we will deadlock.) 
+ if (!Utils.inShutdown()) { if (exception.isInstanceOf[OutOfMemoryError]) { System.exit(ExecutorExitCode.OOM) } else { -- cgit v1.2.3 From 05dc385649277836962a512a83195083990a7134 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Wed, 20 Feb 2013 15:28:12 +0530 Subject: A bug fix post merge, following changes to AkkaUtils --- examples/src/main/scala/spark/streaming/examples/ActorWordCount.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/scala/spark/streaming/examples/ActorWordCount.scala b/examples/src/main/scala/spark/streaming/examples/ActorWordCount.scala index 71b4e5bf1a..346151c147 100644 --- a/examples/src/main/scala/spark/streaming/examples/ActorWordCount.scala +++ b/examples/src/main/scala/spark/streaming/examples/ActorWordCount.scala @@ -147,7 +147,7 @@ object ActorWordCount { */ val lines = ssc.actorStream[String]( - Props(new SampleActorReceiver[String]("akka://spark@%s:%s/user/FeederActor".format( + Props(new SampleActorReceiver[String]("akka://test@%s:%s/user/FeederActor".format( host, port.toInt))), "SampleReceiver") //compute wordcount -- cgit v1.2.3 From 334ab9244113e4b792fd51697ef80ab0d3b3de25 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Wed, 20 Feb 2013 10:26:36 -0800 Subject: Fixed bug in CheckpointSuite --- core/src/main/scala/spark/rdd/CheckpointRDD.scala | 8 ++++---- core/src/test/scala/spark/CheckpointSuite.scala | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/spark/rdd/CheckpointRDD.scala b/core/src/main/scala/spark/rdd/CheckpointRDD.scala index 3328477959..9e37bdf659 100644 --- a/core/src/main/scala/spark/rdd/CheckpointRDD.scala +++ b/core/src/main/scala/spark/rdd/CheckpointRDD.scala @@ -22,10 +22,10 @@ class CheckpointRDD[T: ClassManifest](sc: SparkContext, val checkpointPath: Stri override def getPartitions: Array[Partition] = { val dirContents = fs.listStatus(new Path(checkpointPath)) - val splitFiles = dirContents.map(_.getPath.toString).filter(_.contains("part-")).sorted - val numPartitions = splitFiles.size - if (numPartitions > 0 && !splitFiles(0).endsWith(CheckpointRDD.splitIdToFile(0)) || - !splitFiles(numPartitions-1).endsWith(CheckpointRDD.splitIdToFile(numPartitions-1))) { + val partitionFiles = dirContents.map(_.getPath.toString).filter(_.contains("part-")).sorted + val numPartitions = partitionFiles.size + if (numPartitions > 0 && (! partitionFiles(0).endsWith(CheckpointRDD.splitIdToFile(0)) || + ! partitionFiles(numPartitions-1).endsWith(CheckpointRDD.splitIdToFile(numPartitions-1)))) { throw new SparkException("Invalid checkpoint directory: " + checkpointPath) } Array.tabulate(numPartitions)(i => new CheckpointRDDPartition(i)) diff --git a/core/src/test/scala/spark/CheckpointSuite.scala b/core/src/test/scala/spark/CheckpointSuite.scala index 1935ac9e49..ca385972fb 100644 --- a/core/src/test/scala/spark/CheckpointSuite.scala +++ b/core/src/test/scala/spark/CheckpointSuite.scala @@ -164,12 +164,12 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging { test("CheckpointRDD with zero partitions") { val rdd = new BlockRDD[Int](sc, Array[String]()) - assert(rdd.splits.size === 0) + assert(rdd.partitions.size === 0) assert(rdd.isCheckpointed === false) rdd.checkpoint() assert(rdd.count() === 0) assert(rdd.isCheckpointed === true) - assert(rdd.splits.size === 0) + assert(rdd.partitions.size === 0) } /** -- cgit v1.2.3
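The parentheses added in CheckpointRDD.getPartitions above matter because `&&` binds more tightly than `||` in Scala: the old guard parsed as `(numPartitions > 0 && !partitionFiles(0)...) || !partitionFiles(numPartitions-1)...`, so the right-hand check still ran for an empty checkpoint directory and indexed position -1. A small, self-contained sketch of the same pitfall (the array and the "part-" prefix check are illustrative stand-ins):

    // Illustrative only: mirrors the guard pattern fixed above in CheckpointRDD.getPartitions.
    object GuardPrecedenceSketch {
      def main(args: Array[String]) {
        val files = Array.empty[String]   // e.g. an empty checkpoint directory
        val n = files.size

        // Parses as (n > 0 && !files(0).startsWith("part-")) || !files(n - 1).startsWith("part-"),
        // so files(-1) is evaluated when n == 0 and throws ArrayIndexOutOfBoundsException.
        try {
          if (n > 0 && !files(0).startsWith("part-") || !files(n - 1).startsWith("part-")) {
            println("invalid directory")
          }
        } catch {
          case e: ArrayIndexOutOfBoundsException => println("unparenthesized guard failed: " + e)
        }

        // With the disjunction parenthesized, n > 0 guards both checks, so an empty
        // directory passes through without touching the array.
        if (n > 0 && (!files(0).startsWith("part-") || !files(n - 1).startsWith("part-"))) {
          println("invalid directory")
        } else {
          println("parenthesized guard handles zero partitions")
        }
      }
    }

The "CheckpointRDD with zero partitions" test touched in the same commit covers this empty-directory case.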