diff options
Diffstat (limited to 'site/examples.html')
-rw-r--r-- | site/examples.html | 518 |
1 files changed, 350 insertions, 168 deletions
diff --git a/site/examples.html b/site/examples.html index edf2661e6..7992062c3 100644 --- a/site/examples.html +++ b/site/examples.html @@ -1,27 +1,20 @@ <!DOCTYPE html> -<!--[if IE 6]> -<html id="ie6" dir="ltr" lang="en-US"> -<![endif]--> -<!--[if IE 7]> -<html id="ie7" dir="ltr" lang="en-US"> -<![endif]--> -<!--[if IE 8]> -<html id="ie8" dir="ltr" lang="en-US"> -<![endif]--> -<!--[if !(IE 6) | !(IE 7) | !(IE 8) ]><!--> -<html dir="ltr" lang="en-US"> -<!--<![endif]--> +<html lang="en"> <head> - <link rel="shortcut icon" href="/favicon.ico" /> - <meta charset="UTF-8" /> - <meta name="viewport" content="width=device-width" /> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <title> Examples | Apache Spark </title> - <link rel="stylesheet" type="text/css" media="all" href="/css/style.css" /> - <link rel="stylesheet" href="/css/pygments-default.css"> + + + <!-- Bootstrap core CSS --> + <link href="/css/cerulean.min.css" rel="stylesheet"> + <link href="/css/custom.css" rel="stylesheet"> <script type="text/javascript"> <!-- Google Analytics initialization --> @@ -46,128 +39,212 @@ } </script> - <link rel='canonical' href='/index.html' /> - - <style type="text/css"> - #site-title, - #site-description { - position: absolute !important; - clip: rect(1px 1px 1px 1px); /* IE6, IE7 */ - clip: rect(1px, 1px, 1px, 1px); - } - </style> - <style type="text/css" id="custom-background-css"> - body.custom-background { background-color: #f1f1f1; } - </style> + <!-- HTML5 shim and Respond.js IE8 support of HTML5 elements and media queries --> + <!--[if lt IE 9]> + <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script> + <script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script> + <![endif]--> </head> -<!--body class="page singular"--> -<body class="page singular"> -<div id="page" class="hfeed"> - - <header id="branding" 
role="banner"> - <hgroup> - <h1 id="site-title"><span><a href="/" title="Spark" rel="home">Spark</a></span></h1> - <h2 id="site-description">Lightning-Fast Cluster Computing</h2> - </hgroup> - - <a id="main-logo" href="/"> - <img style="height:175px; width:auto;" src="/images/spark-project-header1-cropped.png" alt="Spark: Lightning-Fast Cluster Computing" title="Spark: Lightning-Fast Cluster Computing" /> - </a> - <div class="widget-summit"> - <a href="http://spark-summit.org"><img src="/images/Summit-Logo-FINALtr-150x150px.png" /></a> - <div class="text"> - <a href="http://spark-summit.org/2013"> - - <strong>Videos and Slides<br/> - Available Now!</strong> - </a> - </div> +<body> + +<div class="container" style="max-width: 1200px;"> + +<div class="masthead"> + + <p class="lead"> + <a href="/"> + <img src="/images/spark-logo.png" + style="height:100px; width:auto; vertical-align: bottom; margin-top: 20px;"></a><span class="tagline"> + Lightning-fast cluster computing + </span> + </p> + +</div> + +<nav class="navbar navbar-default" role="navigation"> + <!-- Brand and toggle get grouped for better mobile display --> + <div class="navbar-header"> + <button type="button" class="navbar-toggle" data-toggle="collapse" + data-target="#navbar-collapse-1"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> </div> - <nav id="access" role="navigation"> - <h3 class="assistive-text">Main menu</h3> - <div class="menu-main-menu-container"> - <ul id="menu-main-menu" class="menu"> - - <li class="menu-item menu-item-type-post_type menu-item-object-page "> - <a href="/index.html">Home</a> - </li> - - <li class="menu-item menu-item-type-post_type menu-item-object-page "> - <a href="/downloads.html">Downloads</a> - </li> - - <li class="menu-item menu-item-type-post_type menu-item-object-page "> - <a href="/documentation.html">Documentation</a> - </li> - - <li class="menu-item 
menu-item-type-post_type menu-item-object-page current-menu-item"> - <a href="/examples.html">Examples</a> - </li> + <!-- Collect the nav links, forms, and other content for toggling --> + <div class="collapse navbar-collapse" id="navbar-collapse-1"> + <ul class="nav navbar-nav"> + <li><a href="/downloads.html">Download</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown"> + Related Projects <b class="caret"></b> + </a> + <ul class="dropdown-menu"> + <li><a href="http://shark.cs.berkeley.edu">Shark (SQL)</a></li> + <li><a href="/streaming/">Spark Streaming</a></li> + <li><a href="/mllib/">MLlib (machine learning)</a></li> + <li><a href="http://amplab.github.io/graphx/">GraphX (graph)</a></li> + </ul> + </li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown"> + Documentation <b class="caret"></b> + </a> + <ul class="dropdown-menu"> + <li><a href="/documentation.html">Overview</a></li> + <li><a href="/docs/latest/">Latest Release</a></li> + <li><a href="/examples.html">Examples</a></li> + </ul> + </li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown"> + Community <b class="caret"></b> + </a> + <ul class="dropdown-menu"> + <li><a href="/community.html">Mailing Lists</a></li> + <li><a href="/community.html#events">Events and Meetups</a></li> + <li><a href="/community.html#history">Project History</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Powered+By+Spark">Powered By</a></li> + </ul> + </li> + <li><a href="/faq.html">FAQ</a></li> + </ul> + </div> + <!-- /.navbar-collapse --> +</nav> + + +<div class="row"> + <div class="col-md-3 col-md-push-9"> + <div class="news" style="margin-bottom: 20px;"> + <h5>Latest News</h5> + <ul class="list-unstyled"> - <li class="menu-item menu-item-type-post_type menu-item-object-page "> - <a href="/mailing-lists.html">Mailing Lists</a> - </li> + <li><a href="/news/spark-0-8-1-released.html">Spark 
0.8.1 released</a> + <span class="small">(Dec 19, 2013)</span></li> - <li class="menu-item menu-item-type-post_type menu-item-object-page "> - <a href="/research.html">Research</a> - </li> + <li><a href="/news/spark-summit-2013-is-a-wrap.html">Spark Summit 2013 is a Wrap</a> + <span class="small">(Dec 15, 2013)</span></li> - <li class="menu-item menu-item-type-post_type menu-item-object-page "> - <a href="/faq.html">FAQ</a> - </li> + <li><a href="/news/announcing-the-first-spark-summit.html">Announcing the first Spark Summit: December 2, 2013</a> + <span class="small">(Oct 08, 2013)</span></li> - </ul></div> - </nav><!-- #access --> -</header><!-- #branding --> - - - - <div id="main"> - <div id="primary"> - <div id="content" role="main"> + <li><a href="/news/spark-0-8-0-released.html">Spark 0.8.0 released</a> + <span class="small">(Sep 25, 2013)</span></li> - <article class="page type-page status-publish hentry"> - <h2>Spark Examples</h2> + </ul> + <p class="small" style="text-align: right;"><a href="/news/index.html">Archive</a></p> + </div> + <div class="hidden-xs hidden-sm"> + <a href="/downloads.html" class="btn btn-success btn-lg btn-block" style="margin-bottom: 30px;"> + Download Spark + </a> + <p style="font-size: 16px; font-weight: 500; color: #555;"> + Related Projects: + </p> + <ul class="list-narrow"> + <li><a href="http://shark.cs.berkeley.edu">Shark (SQL)</a></li> + <li><a href="/streaming/">Spark Streaming</a></li> + <li><a href="/mllib/">MLlib (machine learning)</a></li> + <li><a href="http://amplab.github.io/graphx/">GraphX (graph)</a></li> + </ul> + </div> + </div> -<p>Spark is built around <em>distributed datasets</em> that support types of parallel operations: transformations, which are lazy and yield another distributed dataset (e.g., <code>map</code>, <code>filter</code>, and <code>join</code>), and actions, which force the computation of a dataset and return a result (e.g., <code>count</code>). 
The following examples show off some of the available operations and features. Several additional examples are distributed with Spark:</p> + <div class="col-md-9 col-md-pull-3"> + <h2>Spark Examples</h2> -<ul> - <li>Core Spark: <a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/org/apache/spark/examples">Scala examples</a>, <a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/examples">Java examples</a>, <a href="https://github.com/apache/incubator-spark/tree/master/python/examples">Python examples</a></li> - <li>Streaming Spark: <a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/org/apache/spark/streaming/examples">Scala examples</a>, <a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/streaming/examples">Java examples</a></li> -</ul> +<p>These examples give a quick overview of the Spark API. +Spark is built on the concept of <em>distributed datasets</em>, which contain arbitrary Java or +Python objects. You create a dataset from external data, then apply parallel operations +to it. 
There are two types of operations: <em>transformations</em>, which define a new dataset based on +previous ones, and <em>actions</em>, which kick off a job to execute on a cluster.</p> <h3>Text Search</h3> <p>In this example, we search through the error messages in a log file:</p> -<p> -</p> -<div class="code"> -<span class="keyword">val</span> file = spark.textFile(<span class="string">"hdfs://..."</span>)<br /> -<span class="keyword">val</span> errors = file.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("ERROR")</span>)<br /> -<span class="comment">// Count all the errors</span><br /> -errors.<span class="sparkop">count</span>()<br /> -<span class="comment">// Count errors mentioning MySQL</span><br /> -errors.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("MySQL")</span>).<span class="sparkop">count</span>()<br /> -<span class="comment">// Fetch the MySQL errors as an array of strings</span><br /> -errors.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("MySQL")</span>).<span class="sparkop">collect</span>()<br /> +<ul class="nav nav-tabs"> + <li class="lang-tab lang-tab-scala active"><a href="#">Scala</a></li> + <li class="lang-tab lang-tab-java"><a href="#">Java</a></li> + <li class="lang-tab lang-tab-python"><a href="#">Python</a></li> +</ul> +<div class="tab-content"> + <div class="tab-pane tab-pane-scala active"> + <div class="code code-tab"> + <span class="keyword">val</span> file = spark.textFile(<span class="string">"hdfs://..."</span>)<br /> + <span class="keyword">val</span> errors = file.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("ERROR")</span>)<br /> + <span class="comment">// Count all the errors</span><br /> + errors.<span class="sparkop">count</span>()<br /> + <span class="comment">// Count errors mentioning MySQL</span><br /> + errors.<span class="sparkop">filter</span>(<span class="closure">line => 
line.contains("MySQL")</span>).<span class="sparkop">count</span>()<br /> + <span class="comment">// Fetch the MySQL errors as an array of strings</span><br /> + errors.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("MySQL")</span>).<span class="sparkop">collect</span>()<br /> + </div> + </div> + <div class="tab-pane tab-pane-java"> + <div class="code code-tab"> + JavaRDD<String> file = spark.textFile(<span class="string">"hdfs://..."</span>);<br /> + JavaRDD<String> errors = file.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br /> + public Boolean call(String s) { return s.contains("ERROR"); }<br /> + }</span>);<br /> + <span class="comment">// Count all the errors</span><br /> + errors.<span class="sparkop">count</span>();<br /> + <span class="comment">// Count errors mentioning MySQL</span><br /> + errors.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br /> + public Boolean call(String s) { return s.contains("MySQL"); }<br /> + }</span>).<span class="sparkop">count</span>();<br /> + <span class="comment">// Fetch the MySQL errors as an array of strings</span><br /> + errors.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br /> + public Boolean call(String s) { return s.contains("MySQL"); }<br /> + }</span>).<span class="sparkop">collect</span>();<br /> + </div> + </div> + <div class="tab-pane tab-pane-python"> + <div class="code code-tab"> + file = spark.textFile(<span class="string">"hdfs://..."</span>)<br /> + errors = file.<span class="sparkop">filter</span>(<span class="closure">lambda line: "ERROR" in line</span>)<br /> + <span class="comment"># Count all the errors</span><br /> + errors.<span class="sparkop">count</span>()<br /> + <span class="comment"># Count errors mentioning MySQL</span><br /> + errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in 
line</span>).<span class="sparkop">count</span>()<br /> + <span class="comment"># Fetch the MySQL errors as an array of strings</span><br /> + errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">collect</span>()<br /> + </div> + </div> </div> -<p></p> -<p>The red code fragments are Scala function literals (closures) that get passed automatically to the cluster. The blue ones are Spark operations.</p> +<p>The red code fragments are function literals (closures) that get passed automatically to the cluster. The blue ones are Spark operations.</p> <h3>In-Memory Text Search</h3> <p>Spark can <em>cache</em> datasets in memory to speed up reuse. In the example above, we can load just the error messages in RAM using:</p> -<p> -</p> -<div class="code"> -errors.<span class="sparkop">cache</span>() +<ul class="nav nav-tabs"> + <li class="lang-tab lang-tab-scala active"><a href="#">Scala</a></li> + <li class="lang-tab lang-tab-java"><a href="#">Java</a></li> + <li class="lang-tab lang-tab-python"><a href="#">Python</a></li> +</ul> +<div class="tab-content"> + <div class="tab-pane tab-pane-scala active"> + <div class="code code-tab"> + errors.<span class="sparkop">cache</span>() + </div> + </div> + <div class="tab-pane tab-pane-java"> + <div class="code code-tab"> + errors.<span class="sparkop">cache</span>(); + </div> + </div> + <div class="tab-pane tab-pane-python"> + <div class="code code-tab"> + errors.<span class="sparkop">cache</span>() + </div> + </div> </div> -<p></p> <p>After the first action that uses <code>errors</code>, later ones will be much faster.</p> @@ -175,80 +252,185 @@ errors.<span class="sparkop">cache</span>() <p>In this example, we use a few more transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p> -<p> -</p> -<div class="code"> -<span class="keyword">val</span> file = spark.textFile(<span 
class="string">"hdfs://..."</span>)<br /> -<span class="keyword">val</span> counts = file.<span class="sparkop">flatMap</span>(<span class="closure">line => line.split(" ")</span>)<br /> - .<span class="sparkop">map</span>(<span class="closure">word => (word, 1)</span>)<br /> - .<span class="sparkop">reduceByKey</span>(<span class="closure">_ + _</span>)<br /> -counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>) +<ul class="nav nav-tabs"> + <li class="lang-tab lang-tab-scala active"><a href="#">Scala</a></li> + <li class="lang-tab lang-tab-java"><a href="#">Java</a></li> + <li class="lang-tab lang-tab-python"><a href="#">Python</a></li> +</ul> +<div class="tab-content"> + <div class="tab-pane tab-pane-scala active"> + <div class="code code-tab"> + <span class="keyword">val</span> file = spark.textFile(<span class="string">"hdfs://..."</span>)<br /> + <span class="keyword">val</span> counts = file.<span class="sparkop">flatMap</span>(<span class="closure">line => line.split(" ")</span>)<br /> + .<span class="sparkop">map</span>(<span class="closure">word => (word, 1)</span>)<br /> + .<span class="sparkop">reduceByKey</span>(<span class="closure">_ + _</span>)<br /> + counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>) + </div> + </div> + <div class="tab-pane tab-pane-java"> + <div class="code code-tab"> + JavaRDD<String> file = spark.textFile(<span class="string">"hdfs://..."</span>);<br /> + JavaRDD<String> words = file.<span class="sparkop">flatMap</span>(<span class="closure">new FlatMapFunction<String, String>() {<br /> + public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); }<br /> + }</span>);<br /> + JavaPairRDD<String, Integer> pairs = words.<span class="sparkop">map</span>(<span class="closure">new PairFunction<String, String, Integer>() {<br /> + public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); }<br /> + 
}</span>);<br /> + JavaPairRDD<String, Integer> counts = pairs.<span class="sparkop">reduceByKey</span>(<span class="closure">new Function2<Integer, Integer, Integer>() {<br /> + public Integer call(Integer a, Integer b) { return a + b; }<br /> + }</span>);<br /> + counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>); + </div> + </div> + <div class="tab-pane tab-pane-python"> + <div class="code code-tab"> + file = spark.textFile(<span class="string">"hdfs://..."</span>)<br /> + counts = file.<span class="sparkop">flatMap</span>(<span class="closure">lambda line: line.split(" ")</span>) \<br /> + .<span class="sparkop">map</span>(<span class="closure">lambda word: (word, 1)</span>) \<br /> + .<span class="sparkop">reduceByKey</span>(<span class="closure">lambda a, b: a + b</span>)<br /> + counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>) + </div> + </div> </div> -<p></p> <h3>Estimating Pi</h3> <p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. 
The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p> -<p> -</p> -<div class="code"> -<span class="keyword">val</span> count = spark.parallelize(1 to NUM_SAMPLES).<span class="sparkop">map</span>(<span class="closure">i =><br /> - <span class="keyword">val</span> x = Math.random<br /> - <span class="keyword">val</span> y = Math.random<br /> - <span class="keyword">if</span> (x*x + y*y < 1) 1.0 <span class="keyword">else</span> 0.0<br /> -</span>).<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br /> -println(<span class="string">"Pi is roughly "</span> + 4 * count / NUM_SAMPLES)<br /> +<ul class="nav nav-tabs"> + <li class="lang-tab lang-tab-scala active"><a href="#">Scala</a></li> + <li class="lang-tab lang-tab-java"><a href="#">Java</a></li> + <li class="lang-tab lang-tab-python"><a href="#">Python</a></li> +</ul> +<div class="tab-content"> + <div class="tab-pane tab-pane-scala active"> + <div class="code code-tab"> + <span class="keyword">val</span> count = spark.parallelize(1 to NUM_SAMPLES).<span class="sparkop">map</span>(<span class="closure">i =><br /> + val x = Math.random()<br /> + val y = Math.random()<br /> + if (x*x + y*y < 1) 1 else 0<br /> + </span>).<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br /> + println(<span class="string">"Pi is roughly "</span> + 4.0 * count / NUM_SAMPLES)<br /> + </div> + </div> + <div class="tab-pane tab-pane-java"> + <div class="code code-tab"> + <span class="keyword">long</span> count = spark.parallelize(makeRange(1, NUM_SAMPLES)).<span class="sparkop">filter</span>(<span class="closure">new Function<Integer, Boolean>() {<br /> + public Boolean call(Integer i) {<br /> + double x = Math.random();<br /> + double y = Math.random();<br /> + return x*x + y*y < 1;<br /> + }<br /> + }</span>).<span class="sparkop">count</span>();<br /> + System.out.println(<span class="string">"Pi is roughly "</span> + 4.0 
* count / NUM_SAMPLES);<br /> + </div> + </div> + <div class="tab-pane tab-pane-python"> + <div class="code code-tab"> + <span class="keyword">def</span> sample(p):<br /> + x, y = random(), random()<br /> + <span class="keyword">return</span> 1 <span class="keyword">if</span> x*x + y*y < 1 <span class="keyword">else</span> 0<br /><br /> + count = spark.parallelize(xrange(0, NUM_SAMPLES)).<span class="sparkop">map</span>(<span class="closure">sample</span>) \<br /> + .<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br /> + print <span class="string">"Pi is roughly %f"</span> % (4.0 * count / NUM_SAMPLES)<br /> + </div> + </div> </div> -<p></p> <h3>Logistic Regression</h3> -<p>This is an iterative machine learning algorithm that seeks to find the best hyperplane that separates two sets of points in a multi-dimensional feature space. It can be used to classify messages into spam vs non-spam, for example. Because the algorithm applies the same MapReduce operation repeatedly to the same dataset, it benefits greatly from caching the input data in RAM across iterations.</p> +<p>This is an iterative machine learning algorithm that seeks to find the best hyperplane that separates two sets of points in a multi-dimensional feature space. It can be used to classify messages into spam vs non-spam, for example. 
Because the algorithm applies the same MapReduce operation repeatedly to the same dataset, it benefits greatly from caching the input in RAM across iterations.</p> -<p> -</p> -<div class="code"> -<span class="keyword">val</span> points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br /> -<span class="keyword">var</span> w = Vector.random(D) <span class="comment">// current separating plane</span><br /> -<span class="keyword">for</span> (i <- 1 to ITERATIONS) {<br /> - <span class="keyword">val</span> gradient = points.<span class="sparkop">map</span>(<span class="closure">p =><br /> - (1 / (1 + exp(-p.y*(w dot p.x))) - 1) * p.y * p.x<br /> - </span>).<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br /> - w -= gradient<br /> -}<br /> -println(<span class="string">"Final separating plane: "</span> + w)<br /> +<ul class="nav nav-tabs"> + <li class="lang-tab lang-tab-scala active"><a href="#">Scala</a></li> + <li class="lang-tab lang-tab-java"><a href="#">Java</a></li> + <li class="lang-tab lang-tab-python"><a href="#">Python</a></li> +</ul> +<div class="tab-content"> + <div class="tab-pane tab-pane-scala active"> + <div class="code code-tab"> + <span class="keyword">val</span> points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br /> + <span class="keyword">var</span> w = Vector.random(D) <span class="comment">// current separating plane</span><br /> + <span class="keyword">for</span> (i <- 1 to ITERATIONS) {<br /> + <span class="keyword">val</span> gradient = points.<span class="sparkop">map</span>(<span class="closure">p =><br /> + (1 / (1 + exp(-p.y*(w dot p.x))) - 1) * p.y * p.x<br /> + </span>).<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br /> + w -= gradient<br /> + }<br /> + println(<span class="string">"Final separating plane: "</span> + w)<br /> + </div> + </div> + <div class="tab-pane 
tab-pane-java"> + <div class="code code-tab"> + <span class="keyword">class</span> ComputeGradient <span class="keyword">extends</span> Function<DataPoint, Vector> {<br /> + <span class="keyword">private</span> Vector w;<br /> + ComputeGradient(Vector w) { <span class="keyword">this</span>.w = w; }<br /> + <span class="keyword">public</span> Vector call(DataPoint p) {<br /> + <span class="keyword">return</span> p.x.times(p.y * (1 / (1 + Math.exp(-p.y * w.dot(p.x))) - 1));<br /> + }<br /> + }<br /> + <br /> + JavaRDD<DataPoint> points = spark.textFile(...).<span class="sparkop">map</span>(<span class="closure">new ParsePoint()</span>).<span class="sparkop">cache</span>();<br /> + Vector w = Vector.random(D); <span class="comment">// current separating plane</span><br /> + <span class="keyword">for</span> (<span class="keyword">int</span> i = 0; i < ITERATIONS; i++) {<br /> + Vector gradient = points.<span class="sparkop">map</span>(<span class="closure">new ComputeGradient(w)</span>).<span class="sparkop">reduce</span>(<span class="closure">new AddVectors()</span>);<br /> + w = w.subtract(gradient);<br /> + }<br /> + System.out.println(<span class="string">"Final separating plane: "</span> + w);<br /> + </div> + </div> + <div class="tab-pane tab-pane-python"> + <div class="code code-tab"> + points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br /> + w = numpy.random.ranf(size = D) <span class="comment"># current separating plane</span><br /> + <span class="keyword">for</span> i <span class="keyword">in</span> range(ITERATIONS):<br /> + gradient = points.<span class="sparkop">map</span>(<span class="closure"><br /> + lambda p: (1 / (1 + exp(-p.y*(w.dot(p.x)))) - 1) * p.y * p.x<br /> + </span>).<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br /> + w -= gradient<br /> + print <span class="string">"Final separating plane: %s"</span> % w<br /> + </div> + </div> </div> -<p></p> 
-<p>Note that <code>w</code> gets shipped automatically to the cluster with every <code>map</code> call.</p> +<p>Note that the current separating plane, <code>w</code>, gets shipped automatically to the cluster with every <code>map</code> call.</p> -<p>The graph below compares the performance of this Spark program against a Hadoop implementation on 30 GB of data on an 80-core cluster, showing the benefit of in-memory caching:</p> +<p>The graph below compares the running time per iteration of this Spark program against a Hadoop implementation on 100 GB of data on a 100-node cluster, showing the benefit of in-memory caching:</p> -<p align="center"> -<img src="/images/spark-lr.png" alt="Logistic regression performance in Spark vs Hadoop" /> +<p style="margin-top: 20px; margin-bottom: 30px;"> +<img src="/images/logistic-regression.png" alt="Logistic regression performance in Spark vs Hadoop" /> </p> +<p><a name="additional"></a></p> +<h2>Additional Examples</h2> + +<p>Many additional examples are distributed with Spark:</p> + +<ul> + <li>Basic Spark: <a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/org/apache/spark/examples">Scala examples</a>, <a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/examples">Java examples</a>, <a href="https://github.com/apache/incubator-spark/tree/master/python/examples">Python examples</a></li> + <li>Spark Streaming: <a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/org/apache/spark/streaming/examples">Scala examples</a>, <a href="https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/streaming/examples">Java examples</a></li> +</ul> + - </article><!-- #post --> - - </div><!-- #content --> - - <footer id="colophon" role="contentinfo"> - <div id="site-generator"> - <p style="padding-top: 0; padding-bottom: 15px;"> - Apache Spark is an effort undergoing incubation at The 
Apache Software Foundation. - <a href="http://incubator.apache.org/" style="border: none;"> - <img style="vertical-align: middle; border: none;" src="/images/incubator-logo.png" alt="Apache Incubator" title="Apache Incubator" /> - </a> - </p> </div> -</footer><!-- #colophon --> +</div> - </div><!-- #primary --> - </div><!-- #main --> -</div><!-- #page --> +<footer class="small"> + <hr> + Apache Spark is an effort undergoing incubation at The Apache Software Foundation. + <a href="http://incubator.apache.org/" style="border: none;"> + <img style="vertical-align: middle; float: right; margin-bottom: 15px;" + src="/images/incubator-logo.png" alt="Apache Incubator" title="Apache Incubator" /> + </a> +</footer> + +</div> + +<script src="https://code.jquery.com/jquery.js"></script> +<script src="//netdna.bootstrapcdn.com/bootstrap/3.0.3/js/bootstrap.min.js"></script> +<script src="/js/lang-tabs.js"></script> + </body> </html> |