summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYin Huai <yhuai@apache.org>2016-01-25 21:57:32 +0000
committerYin Huai <yhuai@apache.org>2016-01-25 21:57:32 +0000
commitfd803ebda51509807dcb8f4db312809ea449c987 (patch)
tree2424f3556409ac61ce47e8a1e7e0c6b59ab812f0
parentd6ebe19825e676953342d069738f0026b985215d (diff)
downloadspark-website-fd803ebda51509807dcb8f4db312809ea449c987.tar.gz
spark-website-fd803ebda51509807dcb8f4db312809ea449c987.tar.bz2
spark-website-fd803ebda51509807dcb8f4db312809ea449c987.zip
Update the Spark example page to include examples using high level APIs
-rw-r--r--_config.yml4
-rw-r--r--_layouts/global.html3
-rw-r--r--examples.md561
-rw-r--r--site/community.html3
-rw-r--r--site/documentation.html3
-rw-r--r--site/downloads.html3
-rw-r--r--site/examples.html535
-rw-r--r--site/faq.html3
-rw-r--r--site/graphx/index.html3
-rw-r--r--site/index.html3
-rw-r--r--site/mailing-lists.html3
-rw-r--r--site/mllib/index.html3
-rw-r--r--site/news/amp-camp-2013-registration-ope.html3
-rw-r--r--site/news/announcing-the-first-spark-summit.html3
-rw-r--r--site/news/fourth-spark-screencast-published.html3
-rw-r--r--site/news/index.html3
-rw-r--r--site/news/nsdi-paper.html3
-rw-r--r--site/news/one-month-to-spark-summit-2015.html3
-rw-r--r--site/news/proposals-open-for-spark-summit-east.html3
-rw-r--r--site/news/registration-open-for-spark-summit-east.html3
-rw-r--r--site/news/run-spark-and-shark-on-amazon-emr.html3
-rw-r--r--site/news/spark-0-6-1-and-0-5-2-released.html3
-rw-r--r--site/news/spark-0-6-2-released.html3
-rw-r--r--site/news/spark-0-7-0-released.html3
-rw-r--r--site/news/spark-0-7-2-released.html3
-rw-r--r--site/news/spark-0-7-3-released.html3
-rw-r--r--site/news/spark-0-8-0-released.html3
-rw-r--r--site/news/spark-0-8-1-released.html3
-rw-r--r--site/news/spark-0-9-0-released.html3
-rw-r--r--site/news/spark-0-9-1-released.html3
-rw-r--r--site/news/spark-0-9-2-released.html3
-rw-r--r--site/news/spark-1-0-0-released.html3
-rw-r--r--site/news/spark-1-0-1-released.html3
-rw-r--r--site/news/spark-1-0-2-released.html3
-rw-r--r--site/news/spark-1-1-0-released.html3
-rw-r--r--site/news/spark-1-1-1-released.html3
-rw-r--r--site/news/spark-1-2-0-released.html3
-rw-r--r--site/news/spark-1-2-1-released.html3
-rw-r--r--site/news/spark-1-2-2-released.html3
-rw-r--r--site/news/spark-1-3-0-released.html3
-rw-r--r--site/news/spark-1-4-0-released.html3
-rw-r--r--site/news/spark-1-4-1-released.html3
-rw-r--r--site/news/spark-1-5-0-released.html3
-rw-r--r--site/news/spark-1-5-1-released.html3
-rw-r--r--site/news/spark-1-5-2-released.html3
-rw-r--r--site/news/spark-1-6-0-released.html3
-rw-r--r--site/news/spark-accepted-into-apache-incubator.html3
-rw-r--r--site/news/spark-and-shark-in-the-news.html3
-rw-r--r--site/news/spark-becomes-tlp.html3
-rw-r--r--site/news/spark-featured-in-wired.html3
-rw-r--r--site/news/spark-mailing-lists-moving-to-apache.html3
-rw-r--r--site/news/spark-meetups.html3
-rw-r--r--site/news/spark-screencasts-published.html3
-rw-r--r--site/news/spark-summit-2013-is-a-wrap.html3
-rw-r--r--site/news/spark-summit-2014-videos-posted.html3
-rw-r--r--site/news/spark-summit-2015-videos-posted.html3
-rw-r--r--site/news/spark-summit-agenda-posted.html3
-rw-r--r--site/news/spark-summit-east-2015-videos-posted.html3
-rw-r--r--site/news/spark-summit-east-2016-cfp-closing.html3
-rw-r--r--site/news/spark-summit-east-agenda-posted.html3
-rw-r--r--site/news/spark-summit-europe-agenda-posted.html3
-rw-r--r--site/news/spark-summit-europe.html3
-rw-r--r--site/news/spark-tips-from-quantifind.html3
-rw-r--r--site/news/spark-user-survey-and-powered-by-page.html3
-rw-r--r--site/news/spark-version-0-6-0-released.html3
-rw-r--r--site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html3
-rw-r--r--site/news/strata-exercises-now-available-online.html3
-rw-r--r--site/news/submit-talks-to-spark-summit-2014.html3
-rw-r--r--site/news/submit-talks-to-spark-summit-east-2016.html3
-rw-r--r--site/news/two-weeks-to-spark-summit-2014.html3
-rw-r--r--site/news/video-from-first-spark-development-meetup.html3
-rw-r--r--site/releases/spark-release-0-3.html3
-rw-r--r--site/releases/spark-release-0-5-0.html3
-rw-r--r--site/releases/spark-release-0-5-1.html3
-rw-r--r--site/releases/spark-release-0-5-2.html3
-rw-r--r--site/releases/spark-release-0-6-0.html3
-rw-r--r--site/releases/spark-release-0-6-1.html3
-rw-r--r--site/releases/spark-release-0-6-2.html3
-rw-r--r--site/releases/spark-release-0-7-0.html3
-rw-r--r--site/releases/spark-release-0-7-2.html3
-rw-r--r--site/releases/spark-release-0-7-3.html3
-rw-r--r--site/releases/spark-release-0-8-0.html3
-rw-r--r--site/releases/spark-release-0-8-1.html3
-rw-r--r--site/releases/spark-release-0-9-0.html3
-rw-r--r--site/releases/spark-release-0-9-1.html3
-rw-r--r--site/releases/spark-release-0-9-2.html3
-rw-r--r--site/releases/spark-release-1-0-0.html3
-rw-r--r--site/releases/spark-release-1-0-1.html3
-rw-r--r--site/releases/spark-release-1-0-2.html3
-rw-r--r--site/releases/spark-release-1-1-0.html3
-rw-r--r--site/releases/spark-release-1-1-1.html3
-rw-r--r--site/releases/spark-release-1-2-0.html3
-rw-r--r--site/releases/spark-release-1-2-1.html3
-rw-r--r--site/releases/spark-release-1-2-2.html3
-rw-r--r--site/releases/spark-release-1-3-0.html3
-rw-r--r--site/releases/spark-release-1-3-1.html3
-rw-r--r--site/releases/spark-release-1-4-0.html3
-rw-r--r--site/releases/spark-release-1-4-1.html3
-rw-r--r--site/releases/spark-release-1-5-0.html3
-rw-r--r--site/releases/spark-release-1-5-1.html3
-rw-r--r--site/releases/spark-release-1-5-2.html3
-rw-r--r--site/releases/spark-release-1-6-0.html3
-rw-r--r--site/research.html3
-rw-r--r--site/screencasts/1-first-steps-with-spark.html3
-rw-r--r--site/screencasts/2-spark-documentation-overview.html3
-rw-r--r--site/screencasts/3-transformations-and-caching.html3
-rw-r--r--site/screencasts/4-a-standalone-job-in-spark.html3
-rw-r--r--site/screencasts/index.html3
-rw-r--r--site/sql/index.html3
-rw-r--r--site/streaming/index.html3
110 files changed, 1027 insertions, 394 deletions
diff --git a/_config.yml b/_config.yml
index 2e0857523..9ea9998ef 100644
--- a/_config.yml
+++ b/_config.yml
@@ -1,4 +1,6 @@
-pygments: true
+# pygments option has been renamed to highlighter.
+# pygments: true
+highlighter: pygments
markdown: kramdown
kramdown:
entity_output: symbol
diff --git a/_layouts/global.html b/_layouts/global.html
index ee34775a3..15556b7c7 100644
--- a/_layouts/global.html
+++ b/_layouts/global.html
@@ -24,6 +24,9 @@
<link href="{{site.url}}css/cerulean.min.css" rel="stylesheet">
<link href="{{site.url}}css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="{{site.url}}css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/examples.md b/examples.md
index f7ad08f74..bd2a52bf4 100644
--- a/examples.md
+++ b/examples.md
@@ -6,258 +6,411 @@ navigation:
weight: 4
show: true
---
-<h2>Spark Examples</h2>
+<h1>Spark Examples</h1>
These examples give a quick overview of the Spark API.
Spark is built on the concept of <em>distributed datasets</em>, which contain arbitrary Java or
Python objects. You create a dataset from external data, then apply parallel operations
-to it. There are two types of operations: <em>transformations</em>, which define a new dataset based on
-previous ones, and <em>actions</em>, which kick off a job to execute on a cluster.
+to it. The building block of the Spark API is its [RDD API](http://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds).
+In the RDD API,
+there are two types of operations: <em>transformations</em>, which define a new dataset based on previous ones,
+and <em>actions</em>, which kick off a job to execute on a cluster.
+On top of Spark’s RDD API, high level APIs are provided, e.g.
+[DataFrame API](http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframes) and
+[Machine Learning API](http://spark.apache.org/docs/latest/mllib-guide.html).
+These high level APIs provide a concise way to conduct certain data operations.
+On this page, we show examples using the RDD API as well as examples using the high level APIs.
-<h3>Text Search</h3>
+<h2>RDD API Examples</h2>
-In this example, we search through the error messages in a log file:
+<h3>Word Count</h3>
+<p>In this example, we use a few transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br />
- errors = text_file.<span class="sparkop">filter</span>(<span class="closure">lambda line: "ERROR" in line</span>)<br />
- <span class="comment"># Count all the errors</span><br>
- errors.<span class="sparkop">count</span>()<br>
- <span class="comment"># Count errors mentioning MySQL</span><br>
- errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">count</span>()<br>
- <span class="comment"># Fetch the MySQL errors as an array of strings</span><br>
- errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">collect</span>()<br>
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- <span class="keyword">val</span> textFile = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
- <span class="keyword">val</span> errors = textFile.<span class="sparkop">filter</span>(<span class="closure">line =&gt; line.contains("ERROR")</span>)<br>
- <span class="comment">// Count all the errors</span><br>
- errors.<span class="sparkop">count</span>()<br>
- <span class="comment">// Count errors mentioning MySQL</span><br>
- errors.<span class="sparkop">filter</span>(<span class="closure">line =&gt; line.contains("MySQL")</span>).<span class="sparkop">count</span>()<br>
- <span class="comment">// Fetch the MySQL errors as an array of strings</span><br>
- errors.<span class="sparkop">filter</span>(<span class="closure">line =&gt; line.contains("MySQL")</span>).<span class="sparkop">collect</span>()<br>
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- JavaRDD&lt;String&gt; textFile = spark.textFile(<span class="string">"hdfs://..."</span>);<br>
- JavaRDD&lt;String&gt; errors = textFile.<span class="sparkop">filter</span>(<span class="closure">new Function&lt;String, Boolean&gt;() {<br>
- &nbsp;&nbsp;public Boolean call(String s) { return s.contains("ERROR"); }<br>
- }</span>);<br>
- <span class="comment">// Count all the errors</span><br>
- errors.<span class="sparkop">count</span>();<br>
- <span class="comment">// Count errors mentioning MySQL</span><br>
- errors.<span class="sparkop">filter</span>(<span class="closure">new Function&lt;String, Boolean&gt;() {<br>
- &nbsp;&nbsp;public Boolean call(String s) { return s.contains("MySQL"); }<br>
- }</span>).<span class="sparkop">count</span>();<br>
- <span class="comment">// Fetch the MySQL errors as an array of strings</span><br>
- errors.<span class="sparkop">filter</span>(<span class="closure">new Function&lt;String, Boolean&gt;() {<br>
- &nbsp;&nbsp;public Boolean call(String s) { return s.contains("MySQL"); }<br>
- }</span>).<span class="sparkop">collect</span>();<br>
- </div>
- </div>
-</div>
-
-<p>The red code fragments are function literals (closures) that get passed automatically to the cluster. The blue ones are Spark operations.</p>
-
-<h3>In-Memory Text Search</h3>
-
-<p>Spark can <em>cache</em> datasets in memory to speed up reuse. In the example above, we can load just the error messages in RAM using:</p>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+text_file = sc.textFile("hdfs://...")
+counts = text_file.flatMap(lambda line: line.split(" ")) \
+ .map(lambda word: (word, 1)) \
+ .reduceByKey(lambda a, b: a + b)
+counts.saveAsTextFile("hdfs://...")
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val textFile = sc.textFile("hdfs://...")
+val counts = textFile.flatMap(line => line.split(" "))
+ .map(word => (word, 1))
+ .reduceByKey(_ + _)
+counts.saveAsTextFile("hdfs://...")
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+JavaRDD<String> textFile = sc.textFile("hdfs://...");
+JavaRDD<String> words = textFile.flatMap(new FlatMapFunction<String, String>() {
+ public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); }
+});
+JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
+ public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); }
+});
+JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
+ public Integer call(Integer a, Integer b) { return a + b; }
+});
+counts.saveAsTextFile("hdfs://...");
+{% endhighlight %}
+</div>
+</div>
+</div>
+
+<h3>Pi Estimation</h3>
+<p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- errors.<span class="sparkop">cache</span>()
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- errors.<span class="sparkop">cache</span>()
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- errors.<span class="sparkop">cache</span>();
- </div>
- </div>
-</div>
-
-<p>After the first action that uses <code>errors</code>, later ones will be much faster.</p>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+def sample(p):
+ x, y = random(), random()
+ return 1 if x*x + y*y < 1 else 0
+count = sc.parallelize(xrange(0, NUM_SAMPLES)).map(sample) \
+ .reduce(lambda a, b: a + b)
+print "Pi is roughly %f" % (4.0 * count / NUM_SAMPLES)
+{% endhighlight %}
+</div>
+</div>
-<h3>Word Count</h3>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val count = sc.parallelize(1 to NUM_SAMPLES).map{i =>
+ val x = Math.random()
+ val y = Math.random()
+ if (x*x + y*y < 1) 1 else 0
+}.reduce(_ + _)
+println("Pi is roughly " + 4.0 * count / NUM_SAMPLES)
+{% endhighlight %}
+</div>
+</div>
-<p>In this example, we use a few more transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p>
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+List<Integer> l = new ArrayList<Integer>(NUM_SAMPLES);
+for (int i = 0; i < NUM_SAMPLES; i++) {
+ l.add(i);
+}
+
+long count = sc.parallelize(l).filter(new Function<Integer, Boolean>() {
+ public Boolean call(Integer i) {
+ double x = Math.random();
+ double y = Math.random();
+ return x*x + y*y < 1;
+ }
+}).count();
+System.out.println("Pi is roughly " + 4.0 * count / NUM_SAMPLES);
+{% endhighlight %}
+</div>
+</div>
+</div>
+
+<h2>DataFrame API Examples</h2>
+<p>
+In Spark, a <a href="http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframes">DataFrame</a>
+is a distributed collection of data organized into named columns.
+Users can use the DataFrame API to perform various relational operations on both external
+data sources and Spark’s built-in distributed collections without providing specific procedures for processing data.
+Also, programs based on the DataFrame API will be automatically optimized by Spark’s built-in optimizer, Catalyst.
+</p>
+
+<h3>Text Search</h3>
+<p>In this example, we search through the error messages in a log file.</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
- counts = text_file.<span class="sparkop">flatMap</span>(<span class="closure">lambda line: line.split(" ")</span>) \<br>
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">map</span>(<span class="closure">lambda word: (word, 1)</span>) \<br>
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduceByKey</span>(<span class="closure">lambda a, b: a + b</span>)<br>
- counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>)
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- <span class="keyword">val</span> textFile = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
- <span class="keyword">val</span> counts = textFile.<span class="sparkop">flatMap</span>(<span class="closure">line =&gt; line.split(" ")</span>)<br>
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">map</span>(<span class="closure">word =&gt; (word, 1)</span>)<br>
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduceByKey</span>(<span class="closure">_ + _</span>)<br>
- counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>)
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- JavaRDD&lt;String&gt; textFile = spark.textFile(<span class="string">"hdfs://..."</span>);<br>
- JavaRDD&lt;String&gt; words = textFile.<span class="sparkop">flatMap</span>(<span class="closure">new FlatMapFunction&lt;String, String&gt;() {<br>
- &nbsp;&nbsp;public Iterable&lt;String&gt; call(String s) { return Arrays.asList(s.split(" ")); }<br>
- }</span>);<br>
- JavaPairRDD&lt;String, Integer&gt; pairs = words.<span class="sparkop">mapToPair</span>(<span class="closure">new PairFunction&lt;String, String, Integer&gt;() {<br>
- &nbsp;&nbsp;public Tuple2&lt;String, Integer&gt; call(String s) { return new Tuple2&lt;String, Integer&gt;(s, 1); }<br>
- }</span>);<br>
- JavaPairRDD&lt;String, Integer&gt; counts = pairs.<span class="sparkop">reduceByKey</span>(<span class="closure">new Function2&lt;Integer, Integer, Integer&gt;() {<br>
- &nbsp;&nbsp;public Integer call(Integer a, Integer b) { return a + b; }<br>
- }</span>);<br>
- counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>);
- </div>
- </div>
-</div>
-
-<h3>Estimating Pi</h3>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+textFile = sc.textFile("hdfs://...")
-<p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p>
+# Creates a DataFrame having a single column named "line"
+df = textFile.map(lambda r: Row(r)).toDF(["line"])
+errors = df.filter(col("line").like("%ERROR%"))
+# Counts all the errors
+errors.count()
+# Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count()
+# Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect()
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val textFile = sc.textFile("hdfs://...")
+
+// Creates a DataFrame having a single column named "line"
+val df = textFile.toDF("line")
+val errors = df.filter(col("line").like("%ERROR%"))
+// Counts all the errors
+errors.count()
+// Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count()
+// Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect()
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Creates a DataFrame having a single column named "line"
+JavaRDD<String> textFile = sc.textFile("hdfs://...");
+JavaRDD<Row> rowRDD = textFile.map(
+ new Function<String, Row>() {
+ public Row call(String line) throws Exception {
+ return RowFactory.create(line);
+ }
+ });
+List<StructField> fields = new ArrayList<StructField>();
+fields.add(DataTypes.createStructField("line", DataTypes.StringType, true));
+StructType schema = DataTypes.createStructType(fields);
+DataFrame df = sqlContext.createDataFrame(rowRDD, schema);
+
+DataFrame errors = df.filter(col("line").like("%ERROR%"));
+// Counts all the errors
+errors.count();
+// Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count();
+// Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect();
+{% endhighlight %}
+</div>
+</div>
+</div>
+
+<h3>Simple Data Operations</h3>
+<p>
+In this example, we read a table stored in a database and calculate the number of people for every age.
+Finally, we save the calculated result to S3 in the format of JSON.
+A simple MySQL table "people" is used in the example and this table has two columns,
+"name" and "age".
+</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- <span class="keyword">def</span> sample(p):<br>
- &nbsp;&nbsp;&nbsp;&nbsp;x, y = random(), random()<br>
- &nbsp;&nbsp;&nbsp;&nbsp;<span class="keyword">return</span> 1 <span class="keyword">if</span> x*x + y*y < 1 <span class="keyword">else</span> 0<br><br>
- count = spark.parallelize(xrange(0, NUM_SAMPLES)).<span class="sparkop">map</span>(<span class="closure">sample</span>) \<br>
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br>
- print <span class="string">"Pi is roughly %f"</span> % (4.0 * count / NUM_SAMPLES)<br>
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- <span class="keyword">val</span> count = spark.parallelize(1 to NUM_SAMPLES).<span class="sparkop">map</span>{<span class="closure">i =&gt;<br>
- &nbsp;&nbsp;val x = Math.random()<br>
- &nbsp;&nbsp;val y = Math.random()<br>
- &nbsp;&nbsp;if (x*x + y*y &lt; 1) 1 else 0<br>
- </span>}.<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br>
- println(<span class="string">"Pi is roughly "</span> + 4.0 * count / NUM_SAMPLES)<br>
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- <span class="keyword">int</span> count = spark.parallelize(makeRange(1, NUM_SAMPLES)).<span class="sparkop">filter</span>(<span class="closure">new Function&lt;Integer, Boolean&gt;() {<br>
- &nbsp;&nbsp;public Boolean call(Integer i) {<br>
- &nbsp;&nbsp;&nbsp;&nbsp;double x = Math.random();<br>
- &nbsp;&nbsp;&nbsp;&nbsp;double y = Math.random();<br>
- &nbsp;&nbsp;&nbsp;&nbsp;return x*x + y*y &lt; 1;<br>
- &nbsp;&nbsp;}<br>
- }</span>).<span class="sparkop">count</span>();<br>
- System.out.println(<span class="string">"Pi is roughly "</span> + 4 * count / NUM_SAMPLES);<br>
- </div>
- </div>
-</div>
-
-<h3>Logistic Regression</h3>
-
-<p>This is an iterative machine learning algorithm that seeks to find the best hyperplane that separates two sets of points in a multi-dimensional feature space. It can be used to classify messages into spam vs non-spam, for example. Because the algorithm applies the same MapReduce operation repeatedly to the same dataset, it benefits greatly from caching the input in RAM across iterations.</p>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+# Creates a DataFrame based on a table named "people"
+# stored in a MySQL database.
+url = \
+ "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword"
+df = sqlContext \
+ .read \
+ .format("jdbc") \
+ .option("url", url) \
+ .option("dbtable", "people") \
+ .load()
+
+# Prints the schema of this DataFrame.
+df.printSchema()
+
+# Counts people by age
+countsByAge = df.groupBy("age").count()
+countsByAge.show()
+
+# Saves countsByAge to S3 in the JSON format.
+countsByAge.write.format("json").save("s3a://...")
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+// Creates a DataFrame based on a table named "people"
+// stored in a MySQL database.
+val url =
+ "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword"
+val df = sqlContext
+ .read
+ .format("jdbc")
+ .option("url", url)
+ .option("dbtable", "people")
+ .load()
+
+// Prints the schema of this DataFrame.
+df.printSchema()
+
+// Counts people by age
+val countsByAge = df.groupBy("age").count()
+countsByAge.show()
+
+// Saves countsByAge to S3 in the JSON format.
+countsByAge.write.format("json").save("s3a://...")
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Creates a DataFrame based on a table named "people"
+// stored in a MySQL database.
+String url =
+ "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword";
+DataFrame df = sqlContext
+ .read()
+ .format("jdbc")
+ .option("url", url)
+ .option("dbtable", "people")
+ .load();
+
+// Prints the schema of this DataFrame.
+df.printSchema();
+
+// Counts people by age
+DataFrame countsByAge = df.groupBy("age").count();
+countsByAge.show();
+
+// Saves countsByAge to S3 in the JSON format.
+countsByAge.write().format("json").save("s3a://...");
+{% endhighlight %}
+</div>
+</div>
+</div>
+
+<h2>Machine Learning Example</h2>
+<p>
+<a href="http://spark.apache.org/docs/latest/mllib-guide.html">MLlib</a>, Spark’s Machine Learning (ML) library, provides many distributed ML algorithms.
+These algorithms cover tasks such as feature extraction, classification, regression, clustering,
+recommendation, and more.
+MLlib also provides tools such as ML Pipelines for building workflows, CrossValidator for tuning parameters,
+and model persistence for saving and loading models.
+</p>
+
+<h3>Prediction with Logistic Regression</h3>
+<p>
+In this example, we take a dataset of labels and feature vectors.
+We learn to predict the labels from feature vectors using the Logistic Regression algorithm.
+</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br>
- w = numpy.random.ranf(size = D) <span class="comment"># current separating plane</span><br>
- <span class="keyword">for</span> i <span class="keyword">in</span> range(ITERATIONS):<br>
- &nbsp;&nbsp;&nbsp;&nbsp;gradient = points.<span class="sparkop">map</span>(<span class="closure"><br>
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;lambda p: (1 / (1 + exp(-p.y*(w.dot(p.x)))) - 1) * p.y * p.x<br>
- &nbsp;&nbsp;&nbsp;&nbsp;</span>).<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br>
- &nbsp;&nbsp;&nbsp;&nbsp;w -= gradient<br>
- print <span class="string">"Final separating plane: %s"</span> % w<br>
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- <span class="keyword">val</span> points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br>
- <span class="keyword">var</span> w = Vector.random(D) <span class="comment">// current separating plane</span><br>
- <span class="keyword">for</span> (i &lt;- 1 to ITERATIONS) {<br>
- &nbsp;&nbsp;<span class="keyword">val</span> gradient = points.<span class="sparkop">map</span>(<span class="closure">p =&gt;<br>
- &nbsp;&nbsp;&nbsp;&nbsp;(1 / (1 + exp(-p.y*(w dot p.x))) - 1) * p.y * p.x<br>
- &nbsp;&nbsp;</span>).<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br>
- &nbsp;&nbsp;w -= gradient<br>
- }<br>
- println(<span class="string">"Final separating plane: "</span> + w)<br>
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- <span class="keyword">class</span> ComputeGradient <span class="keyword">extends</span> Function&lt;DataPoint, Vector&gt; {<br>
- &nbsp;&nbsp;<span class="keyword">private</span> Vector w;<br>
- &nbsp;&nbsp;ComputeGradient(Vector w) { <span class="keyword">this</span>.w = w; }<br>
- &nbsp;&nbsp;<span class="keyword">public</span> Vector call(DataPoint p) {<br>
- &nbsp;&nbsp;&nbsp;&nbsp;<span class="keyword">return</span> p.x.times(p.y * (1 / (1 + Math.exp(w.dot(p.x))) - 1));<br>
- &nbsp;&nbsp;}<br>
- }<br>
- <br>
- JavaRDD&lt;DataPoint&gt; points = spark.textFile(...).<span class="sparkop">map</span>(<span class="closure">new ParsePoint()</span>).<span class="sparkop">cache</span>();<br>
- Vector w = Vector.random(D); <span class="comment">// current separating plane</span><br>
- <span class="keyword">for</span> (<span class="keyword">int</span> i = 0; i &lt; ITERATIONS; i++) {<br>
- &nbsp;&nbsp;Vector gradient = points.<span class="sparkop">map</span>(<span class="closure">new ComputeGradient(w)</span>).<span class="sparkop">reduce</span>(<span class="closure">new AddVectors()</span>);<br>
- &nbsp;&nbsp;w = w.subtract(gradient);<br>
- }<br>
- System.out.println(<span class="string">"Final separating plane: "</span> + w);<br>
- </div>
- </div>
-</div>
-
-<p>Note that the current separating plane, <code>w</code>, gets shipped automatically to the cluster with every <code>map</code> call.</p>
-
-<p>The graph below compares the running time per iteration of this Spark program against a Hadoop implementation on 100 GB of data on a 100-node cluster, showing the benefit of in-memory caching:</p>
-
-<p style="margin-top: 20px; margin-bottom: 30px;">
-<img src="{{site.url}}images/logistic-regression.png" alt="Logistic regression performance in Spark vs Hadoop">
-</p>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+# Every record of this DataFrame contains the label and
+# features represented by a vector.
+df = sqlContext.createDataFrame(data, ["label", "features"])
+
+# Set parameters for the algorithm.
+# Here, we limit the number of iterations to 10.
+lr = LogisticRegression(maxIter=10)
+
+# Fit the model to the data.
+model = lr.fit(df)
+
+# Given a dataset, predict each point's label, and show the results.
+model.transform(df).show()
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+// Every record of this DataFrame contains the label and
+// features represented by a vector.
+val df = sqlContext.createDataFrame(data).toDF("label", "features")
+
+// Set parameters for the algorithm.
+// Here, we limit the number of iterations to 10.
+val lr = new LogisticRegression().setMaxIter(10)
+
+// Fit the model to the data.
+val model = lr.fit(df)
+
+// Inspect the model: get the feature weights.
+val weights = model.weights
+
+// Given a dataset, predict each point's label, and show the results.
+model.transform(df).show()
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Every record of this DataFrame contains the label and
+// features represented by a vector.
+StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+
+// Set parameters for the algorithm.
+// Here, we limit the number of iterations to 10.
+LogisticRegression lr = new LogisticRegression().setMaxIter(10);
+
+// Fit the model to the data.
+LogisticRegressionModel model = lr.fit(df);
+
+// Inspect the model: get the feature weights.
+Vector weights = model.weights();
+
+// Given a dataset, predict each point's label, and show the results.
+model.transform(df).show();
+{% endhighlight %}
+</div>
+</div>
+</div>
<a name="additional"></a>
-<h2>Additional Examples</h2>
+<h1>Additional Examples</h1>
Many additional examples are distributed with Spark:
* Basic Spark: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples), [Python examples](https://github.com/apache/spark/tree/master/examples/src/main/python)
- * Spark Streaming: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples/streaming)
-
+ * Spark Streaming: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples/streaming) \ No newline at end of file
diff --git a/site/community.html b/site/community.html
index 2ed389ec7..34f9972a1 100644
--- a/site/community.html
+++ b/site/community.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/documentation.html b/site/documentation.html
index 1051b9d2c..68d55de16 100644
--- a/site/documentation.html
+++ b/site/documentation.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/downloads.html b/site/downloads.html
index d8bc780bc..031176a35 100644
--- a/site/downloads.html
+++ b/site/downloads.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/examples.html b/site/examples.html
index a10741676..f76965748 100644
--- a/site/examples.html
+++ b/site/examples.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
@@ -167,254 +170,409 @@
</div>
<div class="col-md-9 col-md-pull-3">
- <h2>Spark Examples</h2>
+ <h1>Spark Examples</h1>
<p>These examples give a quick overview of the Spark API.
Spark is built on the concept of <em>distributed datasets</em>, which contain arbitrary Java or
Python objects. You create a dataset from external data, then apply parallel operations
-to it. There are two types of operations: <em>transformations</em>, which define a new dataset based on
-previous ones, and <em>actions</em>, which kick off a job to execute on a cluster.</p>
-
-<h3>Text Search</h3>
+to it. The building block of the Spark API is its <a href="http://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds">RDD API</a>.
+In the RDD API,
+there are two types of operations: <em>transformations</em>, which define a new dataset based on previous ones,
+and <em>actions</em>, which kick off a job to execute on a cluster.
+On top of Spark’s RDD API, high level APIs are provided, e.g.
+<a href="http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframes">DataFrame API</a> and
+<a href="http://spark.apache.org/docs/latest/mllib-guide.html">Machine Learning API</a>.
+These high level APIs provide a concise way to conduct certain data operations.
+In this page, we will show examples using RDD API as well as examples using high level APIs.</p>
+
+<h2>RDD API Examples</h2>
-<p>In this example, we search through the error messages in a log file:</p>
+<h3>Word Count</h3>
+<p>In this example, we use a few transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br />
- errors = text_file.<span class="sparkop">filter</span>(<span class="closure">lambda line: "ERROR" in line</span>)<br />
- <span class="comment"># Count all the errors</span><br />
- errors.<span class="sparkop">count</span>()<br />
- <span class="comment"># Count errors mentioning MySQL</span><br />
- errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">count</span>()<br />
- <span class="comment"># Fetch the MySQL errors as an array of strings</span><br />
- errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">collect</span>()<br />
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- <span class="keyword">val</span> textFile = spark.textFile(<span class="string">"hdfs://..."</span>)<br />
- <span class="keyword">val</span> errors = textFile.<span class="sparkop">filter</span>(<span class="closure">line =&gt; line.contains("ERROR")</span>)<br />
- <span class="comment">// Count all the errors</span><br />
- errors.<span class="sparkop">count</span>()<br />
- <span class="comment">// Count errors mentioning MySQL</span><br />
- errors.<span class="sparkop">filter</span>(<span class="closure">line =&gt; line.contains("MySQL")</span>).<span class="sparkop">count</span>()<br />
- <span class="comment">// Fetch the MySQL errors as an array of strings</span><br />
- errors.<span class="sparkop">filter</span>(<span class="closure">line =&gt; line.contains("MySQL")</span>).<span class="sparkop">collect</span>()<br />
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- JavaRDD&lt;String&gt; textFile = spark.textFile(<span class="string">"hdfs://..."</span>);<br />
- JavaRDD&lt;String&gt; errors = textFile.<span class="sparkop">filter</span>(<span class="closure">new Function&lt;String, Boolean&gt;() {<br />
- &nbsp;&nbsp;public Boolean call(String s) { return s.contains("ERROR"); }<br />
- }</span>);<br />
- <span class="comment">// Count all the errors</span><br />
- errors.<span class="sparkop">count</span>();<br />
- <span class="comment">// Count errors mentioning MySQL</span><br />
- errors.<span class="sparkop">filter</span>(<span class="closure">new Function&lt;String, Boolean&gt;() {<br />
- &nbsp;&nbsp;public Boolean call(String s) { return s.contains("MySQL"); }<br />
- }</span>).<span class="sparkop">count</span>();<br />
- <span class="comment">// Fetch the MySQL errors as an array of strings</span><br />
- errors.<span class="sparkop">filter</span>(<span class="closure">new Function&lt;String, Boolean&gt;() {<br />
- &nbsp;&nbsp;public Boolean call(String s) { return s.contains("MySQL"); }<br />
- }</span>).<span class="sparkop">collect</span>();<br />
- </div>
- </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">text_file</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="p">)</span>
+<span class="n">counts</span> <span class="o">=</span> <span class="n">text_file</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="k">lambda</span> <span class="n">line</span><span class="p">:</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&quot; &quot;</span><span class="p">))</span> \
+ <span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">word</span><span class="p">:</span> <span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="mi">1</span><span class="p">))</span> \
+ <span class="o">.</span><span class="n">reduceByKey</span><span class="p">(</span><span class="k">lambda</span> <span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">:</span> <span class="n">a</span> <span class="o">+</span> <span class="n">b</span><span class="p">)</span>
+<span class="n">counts</span><span class="o">.</span><span class="n">saveAsTextFile</span><span class="p">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="p">)</span></code></pre></div>
+
</div>
+</div>
+
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
-<p>The red code fragments are function literals (closures) that get passed automatically to the cluster. The blue ones are Spark operations.</p>
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">textFile</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">counts</span> <span class="k">=</span> <span class="n">textFile</span><span class="o">.</span><span class="n">flatMap</span><span class="o">(</span><span class="n">line</span> <span class="k">=&gt;</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">))</span>
+ <span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">word</span> <span class="k">=&gt;</span> <span class="o">(</span><span class="n">word</span><span class="o">,</span> <span class="mi">1</span><span class="o">))</span>
+ <span class="o">.</span><span class="n">reduceByKey</span><span class="o">(</span><span class="k">_</span> <span class="o">+</span> <span class="k">_</span><span class="o">)</span>
+<span class="n">counts</span><span class="o">.</span><span class="n">saveAsTextFile</span><span class="o">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="o">)</span></code></pre></div>
-<h3>In-Memory Text Search</h3>
+</div>
+</div>
-<p>Spark can <em>cache</em> datasets in memory to speed up reuse. In the example above, we can load just the error messages in RAM using:</p>
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">textFile</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="o">);</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">words</span> <span class="o">=</span> <span class="n">textFile</span><span class="o">.</span><span class="na">flatMap</span><span class="o">(</span><span class="k">new</span> <span class="n">FlatMapFunction</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Iterable</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">s</span><span class="o">)</span> <span class="o">{</span> <span class="k">return</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="n">s</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">));</span> <span class="o">}</span>
+<span class="o">});</span>
+<span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;</span> <span class="n">pairs</span> <span class="o">=</span> <span class="n">words</span><span class="o">.</span><span class="na">mapToPair</span><span class="o">(</span><span class="k">new</span> <span class="n">PairFunction</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">s</span><span class="o">)</span> <span class="o">{</span> <span class="k">return</span> <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;(</span><span class="n">s</span><span class="o">,</span> <span class="mi">1</span><span class="o">);</span> <span class="o">}</span>
+<span class="o">});</span>
+<span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;</span> <span class="n">counts</span> <span class="o">=</span> <span class="n">pairs</span><span class="o">.</span><span class="na">reduceByKey</span><span class="o">(</span><span class="k">new</span> <span class="n">Function2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Integer</span> <span class="nf">call</span><span class="o">(</span><span class="n">Integer</span> <span class="n">a</span><span class="o">,</span> <span class="n">Integer</span> <span class="n">b</span><span class="o">)</span> <span class="o">{</span> <span class="k">return</span> <span class="n">a</span> <span class="o">+</span> <span class="n">b</span><span class="o">;</span> <span class="o">}</span>
+<span class="o">});</span>
+<span class="n">counts</span><span class="o">.</span><span class="na">saveAsTextFile</span><span class="o">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="o">);</span></code></pre></div>
+
+</div>
+</div>
+</div>
+
+<h3>Pi Estimation</h3>
+<p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- errors.<span class="sparkop">cache</span>()
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- errors.<span class="sparkop">cache</span>()
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- errors.<span class="sparkop">cache</span>();
- </div>
- </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="k">def</span> <span class="nf">sample</span><span class="p">(</span><span class="n">p</span><span class="p">):</span>
+ <span class="n">x</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">random</span><span class="p">(),</span> <span class="n">random</span><span class="p">()</span>
+ <span class="k">return</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">x</span><span class="o">*</span><span class="n">x</span> <span class="o">+</span> <span class="n">y</span><span class="o">*</span><span class="n">y</span> <span class="o">&lt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="mi">0</span>
+
+<span class="n">count</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">xrange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">NUM_SAMPLES</span><span class="p">))</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">sample</span><span class="p">)</span> \
+ <span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">:</span> <span class="n">a</span> <span class="o">+</span> <span class="n">b</span><span class="p">)</span>
+<span class="k">print</span> <span class="s">&quot;Pi is roughly </span><span class="si">%f</span><span class="s">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="mf">4.0</span> <span class="o">*</span> <span class="n">count</span> <span class="o">/</span> <span class="n">NUM_SAMPLES</span><span class="p">)</span></code></pre></div>
+
+</div>
</div>
-<p>After the first action that uses <code>errors</code>, later ones will be much faster.</p>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
-<h3>Word Count</h3>
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">count</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="o">(</span><span class="mi">1</span> <span class="n">to</span> <span class="nc">NUM_SAMPLES</span><span class="o">).</span><span class="n">map</span><span class="o">{</span><span class="n">i</span> <span class="k">=&gt;</span>
+ <span class="k">val</span> <span class="n">x</span> <span class="k">=</span> <span class="nc">Math</span><span class="o">.</span><span class="n">random</span><span class="o">()</span>
+ <span class="k">val</span> <span class="n">y</span> <span class="k">=</span> <span class="nc">Math</span><span class="o">.</span><span class="n">random</span><span class="o">()</span>
+ <span class="k">if</span> <span class="o">(</span><span class="n">x</span><span class="o">*</span><span class="n">x</span> <span class="o">+</span> <span class="n">y</span><span class="o">*</span><span class="n">y</span> <span class="o">&lt;</span> <span class="mi">1</span><span class="o">)</span> <span class="mi">1</span> <span class="k">else</span> <span class="mi">0</span>
+<span class="o">}.</span><span class="n">reduce</span><span class="o">(</span><span class="k">_</span> <span class="o">+</span> <span class="k">_</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="s">&quot;Pi is roughly &quot;</span> <span class="o">+</span> <span class="mf">4.0</span> <span class="o">*</span> <span class="n">count</span> <span class="o">/</span> <span class="nc">NUM_SAMPLES</span><span class="o">)</span></code></pre></div>
+
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;</span> <span class="n">l</span> <span class="o">=</span> <span class="k">new</span> <span class="n">ArrayList</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;(</span><span class="n">NUM_SAMPLES</span><span class="o">);</span>
+<span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">NUM_SAMPLES</span><span class="o">;</span> <span class="n">i</span><span class="o">++)</span> <span class="o">{</span>
+ <span class="n">l</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">i</span><span class="o">);</span>
+<span class="o">}</span>
+
+<span class="kt">long</span> <span class="n">count</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">l</span><span class="o">).</span><span class="na">filter</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Boolean</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Boolean</span> <span class="nf">call</span><span class="o">(</span><span class="n">Integer</span> <span class="n">i</span><span class="o">)</span> <span class="o">{</span>
+ <span class="kt">double</span> <span class="n">x</span> <span class="o">=</span> <span class="n">Math</span><span class="o">.</span><span class="na">random</span><span class="o">();</span>
+ <span class="kt">double</span> <span class="n">y</span> <span class="o">=</span> <span class="n">Math</span><span class="o">.</span><span class="na">random</span><span class="o">();</span>
+ <span class="k">return</span> <span class="n">x</span><span class="o">*</span><span class="n">x</span> <span class="o">+</span> <span class="n">y</span><span class="o">*</span><span class="n">y</span> <span class="o">&lt;</span> <span class="mi">1</span><span class="o">;</span>
+ <span class="o">}</span>
+<span class="o">}).</span><span class="na">count</span><span class="o">();</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Pi is roughly &quot;</span> <span class="o">+</span> <span class="mf">4.0</span> <span class="o">*</span> <span class="n">count</span> <span class="o">/</span> <span class="n">NUM_SAMPLES</span><span class="o">);</span></code></pre></div>
-<p>In this example, we use a few more transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p>
+</div>
+</div>
+</div>
+
+<h2>DataFrame API Examples</h2>
+<p>
+In Spark, a <a href="http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframes">DataFrame</a>
+is a distributed collection of data organized into named columns.
+Users can use DataFrame API to perform various relational operations on both external
+data sources and Spark’s built-in distributed collections without providing specific procedures for processing data.
+Also, programs based on DataFrame API will be automatically optimized by Spark’s built-in optimizer, Catalyst.
+</p>
+
+<h3>Text Search</h3>
+<p>In this example, we search through the error messages in a log file.</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br />
- counts = text_file.<span class="sparkop">flatMap</span>(<span class="closure">lambda line: line.split(" ")</span>) \<br />
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">map</span>(<span class="closure">lambda word: (word, 1)</span>) \<br />
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduceByKey</span>(<span class="closure">lambda a, b: a + b</span>)<br />
- counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>)
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- <span class="keyword">val</span> textFile = spark.textFile(<span class="string">"hdfs://..."</span>)<br />
- <span class="keyword">val</span> counts = textFile.<span class="sparkop">flatMap</span>(<span class="closure">line =&gt; line.split(" ")</span>)<br />
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">map</span>(<span class="closure">word =&gt; (word, 1)</span>)<br />
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduceByKey</span>(<span class="closure">_ + _</span>)<br />
- counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>)
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- JavaRDD&lt;String&gt; textFile = spark.textFile(<span class="string">"hdfs://..."</span>);<br />
- JavaRDD&lt;String&gt; words = textFile.<span class="sparkop">flatMap</span>(<span class="closure">new FlatMapFunction&lt;String, String&gt;() {<br />
- &nbsp;&nbsp;public Iterable&lt;String&gt; call(String s) { return Arrays.asList(s.split(" ")); }<br />
- }</span>);<br />
- JavaPairRDD&lt;String, Integer&gt; pairs = words.<span class="sparkop">mapToPair</span>(<span class="closure">new PairFunction&lt;String, String, Integer&gt;() {<br />
- &nbsp;&nbsp;public Tuple2&lt;String, Integer&gt; call(String s) { return new Tuple2&lt;String, Integer&gt;(s, 1); }<br />
- }</span>);<br />
- JavaPairRDD&lt;String, Integer&gt; counts = pairs.<span class="sparkop">reduceByKey</span>(<span class="closure">new Function2&lt;Integer, Integer, Integer&gt;() {<br />
- &nbsp;&nbsp;public Integer call(Integer a, Integer b) { return a + b; }<br />
- }</span>);<br />
- counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>);
- </div>
- </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">textFile</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="p">)</span>
+
+<span class="c"># Creates a DataFrame having a single column named &quot;line&quot;</span>
+<span class="n">df</span> <span class="o">=</span> <span class="n">textFile</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">r</span><span class="p">:</span> <span class="n">Row</span><span class="p">(</span><span class="n">r</span><span class="p">))</span><span class="o">.</span><span class="n">toDF</span><span class="p">([</span><span class="s">&quot;line&quot;</span><span class="p">])</span>
+<span class="n">errors</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s">&quot;line&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">like</span><span class="p">(</span><span class="s">&quot;</span><span class="si">%E</span><span class="s">RROR%&quot;</span><span class="p">))</span>
+<span class="c"># Counts all the errors</span>
+<span class="n">errors</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
+<span class="c"># Counts errors mentioning MySQL</span>
+<span class="n">errors</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s">&quot;line&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">like</span><span class="p">(</span><span class="s">&quot;%MySQL%&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
+<span class="c"># Fetches the MySQL errors as an array of strings</span>
+<span class="n">errors</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s">&quot;line&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">like</span><span class="p">(</span><span class="s">&quot;%MySQL%&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span></code></pre></div>
+
+</div>
</div>
-<h3>Estimating Pi</h3>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
-<p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p>
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">textFile</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="o">)</span>
+
+<span class="c1">// Creates a DataFrame having a single column named &quot;line&quot;</span>
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">textFile</span><span class="o">.</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;line&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">errors</span> <span class="k">=</span> <span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">&quot;line&quot;</span><span class="o">).</span><span class="n">like</span><span class="o">(</span><span class="s">&quot;%ERROR%&quot;</span><span class="o">))</span>
+<span class="c1">// Counts all the errors</span>
+<span class="n">errors</span><span class="o">.</span><span class="n">count</span><span class="o">()</span>
+<span class="c1">// Counts errors mentioning MySQL</span>
+<span class="n">errors</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">&quot;line&quot;</span><span class="o">).</span><span class="n">like</span><span class="o">(</span><span class="s">&quot;%MySQL%&quot;</span><span class="o">)).</span><span class="n">count</span><span class="o">()</span>
+<span class="c1">// Fetches the MySQL errors as an array of strings</span>
+<span class="n">errors</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">&quot;line&quot;</span><span class="o">).</span><span class="n">like</span><span class="o">(</span><span class="s">&quot;%MySQL%&quot;</span><span class="o">)).</span><span class="n">collect</span><span class="o">()</span></code></pre></div>
+
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// Creates a DataFrame having a single column named &quot;line&quot;</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">textFile</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="s">&quot;hdfs://...&quot;</span><span class="o">);</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">rowRDD</span> <span class="o">=</span> <span class="n">textFile</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Row</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Row</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">line</span><span class="o">)</span> <span class="kd">throws</span> <span class="n">Exception</span> <span class="o">{</span>
+ <span class="k">return</span> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">line</span><span class="o">);</span>
+ <span class="o">}</span>
+ <span class="o">});</span>
+<span class="n">List</span><span class="o">&lt;</span><span class="n">StructField</span><span class="o">&gt;</span> <span class="n">fields</span> <span class="o">=</span> <span class="k">new</span> <span class="n">ArrayList</span><span class="o">&lt;</span><span class="n">StructField</span><span class="o">&gt;();</span>
+<span class="n">fields</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructField</span><span class="o">(</span><span class="s">&quot;line&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructType</span><span class="o">(</span><span class="n">fields</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">rowRDD</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+
+<span class="n">DataFrame</span> <span class="n">errors</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="na">filter</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">&quot;line&quot;</span><span class="o">).</span><span class="na">like</span><span class="o">(</span><span class="s">&quot;%ERROR%&quot;</span><span class="o">));</span>
+<span class="c1">// Counts all the errors</span>
+<span class="n">errors</span><span class="o">.</span><span class="na">count</span><span class="o">();</span>
+<span class="c1">// Counts errors mentioning MySQL</span>
+<span class="n">errors</span><span class="o">.</span><span class="na">filter</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">&quot;line&quot;</span><span class="o">).</span><span class="na">like</span><span class="o">(</span><span class="s">&quot;%MySQL%&quot;</span><span class="o">)).</span><span class="na">count</span><span class="o">();</span>
+<span class="c1">// Fetches the MySQL errors as an array of strings</span>
+<span class="n">errors</span><span class="o">.</span><span class="na">filter</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">&quot;line&quot;</span><span class="o">).</span><span class="na">like</span><span class="o">(</span><span class="s">&quot;%MySQL%&quot;</span><span class="o">)).</span><span class="na">collect</span><span class="o">();</span></code></pre></div>
+
+</div>
+</div>
+</div>
+
+<h3>Simple Data Operations</h3>
+<p>
+In this example, we read a table stored in a database and calculate the number of people for every age.
+Finally, we save the calculated result to S3 in JSON format.
+A simple MySQL table "people" is used in the example and this table has two columns,
+"name" and "age".
+</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- <span class="keyword">def</span> sample(p):<br />
- &nbsp;&nbsp;&nbsp;&nbsp;x, y = random(), random()<br />
- &nbsp;&nbsp;&nbsp;&nbsp;<span class="keyword">return</span> 1 <span class="keyword">if</span> x*x + y*y &lt; 1 <span class="keyword">else</span> 0<br /><br />
- count = spark.parallelize(xrange(0, NUM_SAMPLES)).<span class="sparkop">map</span>(<span class="closure">sample</span>) \<br />
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br />
- print <span class="string">"Pi is roughly %f"</span> % (4.0 * count / NUM_SAMPLES)<br />
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- <span class="keyword">val</span> count = spark.parallelize(1 to NUM_SAMPLES).<span class="sparkop">map</span>{<span class="closure">i =&gt;<br />
- &nbsp;&nbsp;val x = Math.random()<br />
- &nbsp;&nbsp;val y = Math.random()<br />
- &nbsp;&nbsp;if (x*x + y*y &lt; 1) 1 else 0<br />
- </span>}.<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br />
- println(<span class="string">"Pi is roughly "</span> + 4.0 * count / NUM_SAMPLES)<br />
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- <span class="keyword">int</span> count = spark.parallelize(makeRange(1, NUM_SAMPLES)).<span class="sparkop">filter</span>(<span class="closure">new Function&lt;Integer, Boolean&gt;() {<br />
- &nbsp;&nbsp;public Boolean call(Integer i) {<br />
- &nbsp;&nbsp;&nbsp;&nbsp;double x = Math.random();<br />
- &nbsp;&nbsp;&nbsp;&nbsp;double y = Math.random();<br />
- &nbsp;&nbsp;&nbsp;&nbsp;return x*x + y*y &lt; 1;<br />
- &nbsp;&nbsp;}<br />
- }</span>).<span class="sparkop">count</span>();<br />
- System.out.println(<span class="string">"Pi is roughly "</span> + 4 * count / NUM_SAMPLES);<br />
- </div>
- </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># Creates a DataFrame based on a table named &quot;people&quot;</span>
+<span class="c"># stored in a MySQL database.</span>
+<span class="n">url</span> <span class="o">=</span> \
+ <span class="s">&quot;jdbc:mysql://yourIP:yourPort/test?user=yourUsername&amp;password=yourPassword&quot;</span>
+<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span> \
+ <span class="o">.</span><span class="n">read</span> \
+ <span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s">&quot;jdbc&quot;</span><span class="p">)</span> \
+ <span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s">&quot;url&quot;</span><span class="p">,</span> <span class="n">url</span><span class="p">)</span> \
+ <span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s">&quot;dbtable&quot;</span><span class="p">,</span> <span class="s">&quot;people&quot;</span><span class="p">)</span> \
+ <span class="o">.</span><span class="n">load</span><span class="p">()</span>
+
+<span class="c"># Looks at the schema of this DataFrame.</span>
+<span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
+
+<span class="c"># Counts people by age</span>
+<span class="n">countsByAge</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">groupBy</span><span class="p">(</span><span class="s">&quot;age&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
+<span class="n">countsByAge</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
+
+<span class="c"># Saves countsByAge to S3 in the JSON format.</span>
+<span class="n">countsByAge</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s">&quot;json&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s">&quot;s3a://...&quot;</span><span class="p">)</span></code></pre></div>
+
+</div>
</div>
-<h3>Logistic Regression</h3>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// Creates a DataFrame based on a table named &quot;people&quot;</span>
+<span class="c1">// stored in a MySQL database.</span>
+<span class="k">val</span> <span class="n">url</span> <span class="k">=</span>
+ <span class="s">&quot;jdbc:mysql://yourIP:yourPort/test?user=yourUsername&amp;password=yourPassword&quot;</span>
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span>
+ <span class="o">.</span><span class="n">read</span>
+ <span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">&quot;jdbc&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">option</span><span class="o">(</span><span class="s">&quot;url&quot;</span><span class="o">,</span> <span class="n">url</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">option</span><span class="o">(</span><span class="s">&quot;dbtable&quot;</span><span class="o">,</span> <span class="s">&quot;people&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">load</span><span class="o">()</span>
+
+<span class="c1">// Looks at the schema of this DataFrame.</span>
+<span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="o">()</span>
-<p>This is an iterative machine learning algorithm that seeks to find the best hyperplane that separates two sets of points in a multi-dimensional feature space. It can be used to classify messages into spam vs non-spam, for example. Because the algorithm applies the same MapReduce operation repeatedly to the same dataset, it benefits greatly from caching the input in RAM across iterations.</p>
+<span class="c1">// Counts people by age</span>
+<span class="k">val</span> <span class="n">countsByAge</span> <span class="k">=</span> <span class="n">df</span><span class="o">.</span><span class="n">groupBy</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">).</span><span class="n">count</span><span class="o">()</span>
+<span class="n">countsByAge</span><span class="o">.</span><span class="n">show</span><span class="o">()</span>
+
+<span class="c1">// Saves countsByAge to S3 in the JSON format.</span>
+<span class="n">countsByAge</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">&quot;json&quot;</span><span class="o">).</span><span class="n">save</span><span class="o">(</span><span class="s">&quot;s3a://...&quot;</span><span class="o">)</span></code></pre></div>
+
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// Creates a DataFrame based on a table named &quot;people&quot;</span>
+<span class="c1">// stored in a MySQL database.</span>
+<span class="n">String</span> <span class="n">url</span> <span class="o">=</span>
+ <span class="s">&quot;jdbc:mysql://yourIP:yourPort/test?user=yourUsername&amp;password=yourPassword&quot;</span><span class="o">;</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span>
+ <span class="o">.</span><span class="na">read</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;jdbc&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">&quot;url&quot;</span><span class="o">,</span> <span class="n">url</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">&quot;dbtable&quot;</span><span class="o">,</span> <span class="s">&quot;people&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">load</span><span class="o">();</span>
+
+<span class="c1">// Looks at the schema of this DataFrame.</span>
+<span class="n">df</span><span class="o">.</span><span class="na">printSchema</span><span class="o">();</span>
+
+<span class="c1">// Counts people by age</span>
+<span class="n">DataFrame</span> <span class="n">countsByAge</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="na">groupBy</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">).</span><span class="na">count</span><span class="o">();</span>
+<span class="n">countsByAge</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
+
+<span class="c1">// Saves countsByAge to S3 in the JSON format.</span>
+<span class="n">countsByAge</span><span class="o">.</span><span class="na">write</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;json&quot;</span><span class="o">).</span><span class="na">save</span><span class="o">(</span><span class="s">&quot;s3a://...&quot;</span><span class="o">);</span></code></pre></div>
+
+</div>
+</div>
+</div>
+
+<h2>Machine Learning Example</h2>
+<p>
+<a href="http://spark.apache.org/docs/latest/mllib-guide.html">MLlib</a>, Spark’s Machine Learning (ML) library, provides many distributed ML algorithms.
+These algorithms cover tasks such as feature extraction, classification, regression, clustering,
+recommendation, and more.
+MLlib also provides tools such as ML Pipelines for building workflows, CrossValidator for tuning parameters,
+and model persistence for saving and loading models.
+</p>
+
+<h3>Prediction with Logistic Regression</h3>
+<p>
+In this example, we take a dataset of labels and feature vectors.
+We learn to predict the labels from feature vectors using the Logistic Regression algorithm.
+</p>
<ul class="nav nav-tabs">
<li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
<li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
<li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
</ul>
+
<div class="tab-content">
- <div class="tab-pane tab-pane-python active">
- <div class="code code-tab">
- points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br />
- w = numpy.random.ranf(size = D) <span class="comment"># current separating plane</span><br />
- <span class="keyword">for</span> i <span class="keyword">in</span> range(ITERATIONS):<br />
- &nbsp;&nbsp;&nbsp;&nbsp;gradient = points.<span class="sparkop">map</span>(<span class="closure"><br />
- &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;lambda p: (1 / (1 + exp(-p.y*(w.dot(p.x)))) - 1) * p.y * p.x<br />
- &nbsp;&nbsp;&nbsp;&nbsp;</span>).<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br />
- &nbsp;&nbsp;&nbsp;&nbsp;w -= gradient<br />
- print <span class="string">"Final separating plane: %s"</span> % w<br />
- </div>
- </div>
- <div class="tab-pane tab-pane-scala">
- <div class="code code-tab">
- <span class="keyword">val</span> points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br />
- <span class="keyword">var</span> w = Vector.random(D) <span class="comment">// current separating plane</span><br />
- <span class="keyword">for</span> (i &lt;- 1 to ITERATIONS) {<br />
- &nbsp;&nbsp;<span class="keyword">val</span> gradient = points.<span class="sparkop">map</span>(<span class="closure">p =&gt;<br />
- &nbsp;&nbsp;&nbsp;&nbsp;(1 / (1 + exp(-p.y*(w dot p.x))) - 1) * p.y * p.x<br />
- &nbsp;&nbsp;</span>).<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br />
- &nbsp;&nbsp;w -= gradient<br />
- }<br />
- println(<span class="string">"Final separating plane: "</span> + w)<br />
- </div>
- </div>
- <div class="tab-pane tab-pane-java">
- <div class="code code-tab">
- <span class="keyword">class</span> ComputeGradient <span class="keyword">extends</span> Function&lt;DataPoint, Vector&gt; {<br />
- &nbsp;&nbsp;<span class="keyword">private</span> Vector w;<br />
- &nbsp;&nbsp;ComputeGradient(Vector w) { <span class="keyword">this</span>.w = w; }<br />
- &nbsp;&nbsp;<span class="keyword">public</span> Vector call(DataPoint p) {<br />
- &nbsp;&nbsp;&nbsp;&nbsp;<span class="keyword">return</span> p.x.times(p.y * (1 / (1 + Math.exp(w.dot(p.x))) - 1));<br />
- &nbsp;&nbsp;}<br />
- }<br />
- <br />
- JavaRDD&lt;DataPoint&gt; points = spark.textFile(...).<span class="sparkop">map</span>(<span class="closure">new ParsePoint()</span>).<span class="sparkop">cache</span>();<br />
- Vector w = Vector.random(D); <span class="comment">// current separating plane</span><br />
- <span class="keyword">for</span> (<span class="keyword">int</span> i = 0; i &lt; ITERATIONS; i++) {<br />
- &nbsp;&nbsp;Vector gradient = points.<span class="sparkop">map</span>(<span class="closure">new ComputeGradient(w)</span>).<span class="sparkop">reduce</span>(<span class="closure">new AddVectors()</span>);<br />
- &nbsp;&nbsp;w = w.subtract(gradient);<br />
- }<br />
- System.out.println(<span class="string">"Final separating plane: "</span> + w);<br />
- </div>
- </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># Every record of this DataFrame contains the label and</span>
+<span class="c"># features represented by a vector.</span>
+<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s">&quot;label&quot;</span><span class="p">,</span> <span class="s">&quot;features&quot;</span><span class="p">])</span>
+
+<span class="c"># Set parameters for the algorithm.</span>
+<span class="c"># Here, we limit the number of iterations to 10.</span>
+<span class="n">lr</span> <span class="o">=</span> <span class="n">LogisticRegression</span><span class="p">(</span><span class="n">maxIter</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
+
+<span class="c"># Fit the model to the data.</span>
+<span class="n">model</span> <span class="o">=</span> <span class="n">lr</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
+
+<span class="c"># Given a dataset, predict each point&#39;s label, and show the results.</span>
+<span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span></code></pre></div>
+
+</div>
</div>
-<p>Note that the current separating plane, <code>w</code>, gets shipped automatically to the cluster with every <code>map</code> call.</p>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
-<p>The graph below compares the running time per iteration of this Spark program against a Hadoop implementation on 100 GB of data on a 100-node cluster, showing the benefit of in-memory caching:</p>
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// Every record of this DataFrame contains the label and</span>
+<span class="c1">// features represented by a vector.</span>
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="s">&quot;features&quot;</span><span class="o">)</span>
-<p style="margin-top: 20px; margin-bottom: 30px;">
-<img src="/images/logistic-regression.png" alt="Logistic regression performance in Spark vs Hadoop" />
-</p>
+<span class="c1">// Set parameters for the algorithm.</span>
+<span class="c1">// Here, we limit the number of iterations to 10.</span>
+<span class="k">val</span> <span class="n">lr</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">LogisticRegression</span><span class="o">().</span><span class="n">setMaxIter</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span>
+
+<span class="c1">// Fit the model to the data.</span>
+<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">lr</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+
+<span class="c1">// Inspect the model: get the feature weights.</span>
+<span class="k">val</span> <span class="n">weights</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">weights</span>
+
+<span class="c1">// Given a dataset, predict each point&#39;s label, and show the results.</span>
+<span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="n">show</span><span class="o">()</span></code></pre></div>
+
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+
+<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// Every record of this DataFrame contains the label and</span>
+<span class="c1">// features represented by a vector.</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+
+<span class="c1">// Set parameters for the algorithm.</span>
+<span class="c1">// Here, we limit the number of iterations to 10.</span>
+<span class="n">LogisticRegression</span> <span class="n">lr</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">LogisticRegression</span><span class="o">().</span><span class="na">setMaxIter</span><span class="o">(</span><span class="mi">10</span><span class="o">);</span>
+
+<span class="c1">// Fit the model to the data.</span>
+<span class="n">LogisticRegressionModel</span> <span class="n">model</span> <span class="o">=</span> <span class="n">lr</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span>
+
+<span class="c1">// Inspect the model: get the feature weights.</span>
+<span class="n">Vector</span> <span class="n">weights</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="na">weights</span><span class="o">();</span>
+
+<span class="c1">// Given a dataset, predict each point&#39;s label, and show the results.</span>
+<span class="n">model</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">show</span><span class="o">();</span></code></pre></div>
+
+</div>
+</div>
+</div>
<p><a name="additional"></a></p>
-<h2>Additional Examples</h2>
+<h1>Additional Examples</h1>
<p>Many additional examples are distributed with Spark:</p>
@@ -423,7 +581,6 @@ previous ones, and <em>actions</em>, which kick off a job to execute on a cluste
<li>Spark Streaming: <a href="https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming">Scala examples</a>, <a href="https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples/streaming">Java examples</a></li>
</ul>
-
</div>
</div>
diff --git a/site/faq.html b/site/faq.html
index 685b2c453..8a1247099 100644
--- a/site/faq.html
+++ b/site/faq.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/graphx/index.html b/site/graphx/index.html
index 8a5165d3e..34a7ea8cc 100644
--- a/site/graphx/index.html
+++ b/site/graphx/index.html
@@ -20,6 +20,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/index.html b/site/index.html
index e29e50bde..7a826f6fe 100644
--- a/site/index.html
+++ b/site/index.html
@@ -20,6 +20,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/mailing-lists.html b/site/mailing-lists.html
index 62b00e0dd..96a3ae027 100644
--- a/site/mailing-lists.html
+++ b/site/mailing-lists.html
@@ -21,6 +21,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/mllib/index.html b/site/mllib/index.html
index 79383ba3b..a4cb9a327 100644
--- a/site/mllib/index.html
+++ b/site/mllib/index.html
@@ -20,6 +20,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/amp-camp-2013-registration-ope.html b/site/news/amp-camp-2013-registration-ope.html
index d39e8b372..5b572a22a 100644
--- a/site/news/amp-camp-2013-registration-ope.html
+++ b/site/news/amp-camp-2013-registration-ope.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/announcing-the-first-spark-summit.html b/site/news/announcing-the-first-spark-summit.html
index ec65b8cf1..8323cd57e 100644
--- a/site/news/announcing-the-first-spark-summit.html
+++ b/site/news/announcing-the-first-spark-summit.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/fourth-spark-screencast-published.html b/site/news/fourth-spark-screencast-published.html
index 9e73da764..49479c391 100644
--- a/site/news/fourth-spark-screencast-published.html
+++ b/site/news/fourth-spark-screencast-published.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/index.html b/site/news/index.html
index b4cbef53f..16cb6c91b 100644
--- a/site/news/index.html
+++ b/site/news/index.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/nsdi-paper.html b/site/news/nsdi-paper.html
index f37727eda..7e185685d 100644
--- a/site/news/nsdi-paper.html
+++ b/site/news/nsdi-paper.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/one-month-to-spark-summit-2015.html b/site/news/one-month-to-spark-summit-2015.html
index 18e25fccb..48f0a5981 100644
--- a/site/news/one-month-to-spark-summit-2015.html
+++ b/site/news/one-month-to-spark-summit-2015.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/proposals-open-for-spark-summit-east.html b/site/news/proposals-open-for-spark-summit-east.html
index 4b03644d2..6eb82f7d2 100644
--- a/site/news/proposals-open-for-spark-summit-east.html
+++ b/site/news/proposals-open-for-spark-summit-east.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/registration-open-for-spark-summit-east.html b/site/news/registration-open-for-spark-summit-east.html
index c19703778..969f02037 100644
--- a/site/news/registration-open-for-spark-summit-east.html
+++ b/site/news/registration-open-for-spark-summit-east.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/run-spark-and-shark-on-amazon-emr.html b/site/news/run-spark-and-shark-on-amazon-emr.html
index d8eb3d938..557498a72 100644
--- a/site/news/run-spark-and-shark-on-amazon-emr.html
+++ b/site/news/run-spark-and-shark-on-amazon-emr.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-6-1-and-0-5-2-released.html b/site/news/spark-0-6-1-and-0-5-2-released.html
index 55ca076fb..48cd53eec 100644
--- a/site/news/spark-0-6-1-and-0-5-2-released.html
+++ b/site/news/spark-0-6-1-and-0-5-2-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-6-2-released.html b/site/news/spark-0-6-2-released.html
index 81b167490..5a36f2696 100644
--- a/site/news/spark-0-6-2-released.html
+++ b/site/news/spark-0-6-2-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-7-0-released.html b/site/news/spark-0-7-0-released.html
index 70dcf041f..f58ec11ec 100644
--- a/site/news/spark-0-7-0-released.html
+++ b/site/news/spark-0-7-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-7-2-released.html b/site/news/spark-0-7-2-released.html
index 93772c8eb..a045fcdc9 100644
--- a/site/news/spark-0-7-2-released.html
+++ b/site/news/spark-0-7-2-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-7-3-released.html b/site/news/spark-0-7-3-released.html
index 2d6838f8a..fb3298bc7 100644
--- a/site/news/spark-0-7-3-released.html
+++ b/site/news/spark-0-7-3-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-8-0-released.html b/site/news/spark-0-8-0-released.html
index ac6b4720f..4d6436aac 100644
--- a/site/news/spark-0-8-0-released.html
+++ b/site/news/spark-0-8-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-8-1-released.html b/site/news/spark-0-8-1-released.html
index b7f96f991..0a5c3b9be 100644
--- a/site/news/spark-0-8-1-released.html
+++ b/site/news/spark-0-8-1-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-9-0-released.html b/site/news/spark-0-9-0-released.html
index 62dd9324f..4f71ad47f 100644
--- a/site/news/spark-0-9-0-released.html
+++ b/site/news/spark-0-9-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-9-1-released.html b/site/news/spark-0-9-1-released.html
index ea635d20c..8557233e9 100644
--- a/site/news/spark-0-9-1-released.html
+++ b/site/news/spark-0-9-1-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-0-9-2-released.html b/site/news/spark-0-9-2-released.html
index 379e78c5e..8fa540d34 100644
--- a/site/news/spark-0-9-2-released.html
+++ b/site/news/spark-0-9-2-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-0-0-released.html b/site/news/spark-1-0-0-released.html
index f2f4f15a7..00c5e43cd 100644
--- a/site/news/spark-1-0-0-released.html
+++ b/site/news/spark-1-0-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-0-1-released.html b/site/news/spark-1-0-1-released.html
index 3b76a2ab3..bb50eac60 100644
--- a/site/news/spark-1-0-1-released.html
+++ b/site/news/spark-1-0-1-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-0-2-released.html b/site/news/spark-1-0-2-released.html
index b7958865c..739cf658f 100644
--- a/site/news/spark-1-0-2-released.html
+++ b/site/news/spark-1-0-2-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-1-0-released.html b/site/news/spark-1-1-0-released.html
index 86d5ddd63..727252872 100644
--- a/site/news/spark-1-1-0-released.html
+++ b/site/news/spark-1-1-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-1-1-released.html b/site/news/spark-1-1-1-released.html
index f2e75e5b2..b4161da76 100644
--- a/site/news/spark-1-1-1-released.html
+++ b/site/news/spark-1-1-1-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-2-0-released.html b/site/news/spark-1-2-0-released.html
index a0347c3f0..cd7bc10c2 100644
--- a/site/news/spark-1-2-0-released.html
+++ b/site/news/spark-1-2-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-2-1-released.html b/site/news/spark-1-2-1-released.html
index de48f8af0..ee14e9e58 100644
--- a/site/news/spark-1-2-1-released.html
+++ b/site/news/spark-1-2-1-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-2-2-released.html b/site/news/spark-1-2-2-released.html
index c64a7cac9..1dd6f4d83 100644
--- a/site/news/spark-1-2-2-released.html
+++ b/site/news/spark-1-2-2-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-3-0-released.html b/site/news/spark-1-3-0-released.html
index 06ae213f5..b8cd9a279 100644
--- a/site/news/spark-1-3-0-released.html
+++ b/site/news/spark-1-3-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-4-0-released.html b/site/news/spark-1-4-0-released.html
index 32d375139..c0deaea23 100644
--- a/site/news/spark-1-4-0-released.html
+++ b/site/news/spark-1-4-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-4-1-released.html b/site/news/spark-1-4-1-released.html
index e95376d13..fd99f87c1 100644
--- a/site/news/spark-1-4-1-released.html
+++ b/site/news/spark-1-4-1-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-5-0-released.html b/site/news/spark-1-5-0-released.html
index 3d48ec441..867567d34 100644
--- a/site/news/spark-1-5-0-released.html
+++ b/site/news/spark-1-5-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-5-1-released.html b/site/news/spark-1-5-1-released.html
index 74f3c8c7c..d97135f17 100644
--- a/site/news/spark-1-5-1-released.html
+++ b/site/news/spark-1-5-1-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-5-2-released.html b/site/news/spark-1-5-2-released.html
index 871189d67..d184dcb2e 100644
--- a/site/news/spark-1-5-2-released.html
+++ b/site/news/spark-1-5-2-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-1-6-0-released.html b/site/news/spark-1-6-0-released.html
index a267fda97..f4eace39d 100644
--- a/site/news/spark-1-6-0-released.html
+++ b/site/news/spark-1-6-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-accepted-into-apache-incubator.html b/site/news/spark-accepted-into-apache-incubator.html
index 4ed8668b7..73f23a2ce 100644
--- a/site/news/spark-accepted-into-apache-incubator.html
+++ b/site/news/spark-accepted-into-apache-incubator.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-and-shark-in-the-news.html b/site/news/spark-and-shark-in-the-news.html
index 139074c46..3155aa4a4 100644
--- a/site/news/spark-and-shark-in-the-news.html
+++ b/site/news/spark-and-shark-in-the-news.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-becomes-tlp.html b/site/news/spark-becomes-tlp.html
index e3908d1ef..d4a356674 100644
--- a/site/news/spark-becomes-tlp.html
+++ b/site/news/spark-becomes-tlp.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-featured-in-wired.html b/site/news/spark-featured-in-wired.html
index 0e07a330f..c194ae521 100644
--- a/site/news/spark-featured-in-wired.html
+++ b/site/news/spark-featured-in-wired.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-mailing-lists-moving-to-apache.html b/site/news/spark-mailing-lists-moving-to-apache.html
index 1d631022a..4fa0498ac 100644
--- a/site/news/spark-mailing-lists-moving-to-apache.html
+++ b/site/news/spark-mailing-lists-moving-to-apache.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-meetups.html b/site/news/spark-meetups.html
index e5808566a..4a36aad80 100644
--- a/site/news/spark-meetups.html
+++ b/site/news/spark-meetups.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-screencasts-published.html b/site/news/spark-screencasts-published.html
index 10dddea50..81b4904a8 100644
--- a/site/news/spark-screencasts-published.html
+++ b/site/news/spark-screencasts-published.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-2013-is-a-wrap.html b/site/news/spark-summit-2013-is-a-wrap.html
index 03e22e729..70ef69803 100644
--- a/site/news/spark-summit-2013-is-a-wrap.html
+++ b/site/news/spark-summit-2013-is-a-wrap.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-2014-videos-posted.html b/site/news/spark-summit-2014-videos-posted.html
index 9fde161fd..5e2423dff 100644
--- a/site/news/spark-summit-2014-videos-posted.html
+++ b/site/news/spark-summit-2014-videos-posted.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-2015-videos-posted.html b/site/news/spark-summit-2015-videos-posted.html
index 75f465a28..d4b11694d 100644
--- a/site/news/spark-summit-2015-videos-posted.html
+++ b/site/news/spark-summit-2015-videos-posted.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-agenda-posted.html b/site/news/spark-summit-agenda-posted.html
index 7baae0048..47b31a201 100644
--- a/site/news/spark-summit-agenda-posted.html
+++ b/site/news/spark-summit-agenda-posted.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-east-2015-videos-posted.html b/site/news/spark-summit-east-2015-videos-posted.html
index b2c93e64d..c90d88463 100644
--- a/site/news/spark-summit-east-2015-videos-posted.html
+++ b/site/news/spark-summit-east-2015-videos-posted.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-east-2016-cfp-closing.html b/site/news/spark-summit-east-2016-cfp-closing.html
index 8c7a32de6..2d1a055b9 100644
--- a/site/news/spark-summit-east-2016-cfp-closing.html
+++ b/site/news/spark-summit-east-2016-cfp-closing.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-east-agenda-posted.html b/site/news/spark-summit-east-agenda-posted.html
index 84f5a6c4e..da23fa89a 100644
--- a/site/news/spark-summit-east-agenda-posted.html
+++ b/site/news/spark-summit-east-agenda-posted.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-europe-agenda-posted.html b/site/news/spark-summit-europe-agenda-posted.html
index e835d3605..12282c4bb 100644
--- a/site/news/spark-summit-europe-agenda-posted.html
+++ b/site/news/spark-summit-europe-agenda-posted.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-summit-europe.html b/site/news/spark-summit-europe.html
index c2a1f9352..452ff8f13 100644
--- a/site/news/spark-summit-europe.html
+++ b/site/news/spark-summit-europe.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-tips-from-quantifind.html b/site/news/spark-tips-from-quantifind.html
index e52bf2237..c7572fd3b 100644
--- a/site/news/spark-tips-from-quantifind.html
+++ b/site/news/spark-tips-from-quantifind.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-user-survey-and-powered-by-page.html b/site/news/spark-user-survey-and-powered-by-page.html
index 20c410dcb..860a89f36 100644
--- a/site/news/spark-user-survey-and-powered-by-page.html
+++ b/site/news/spark-user-survey-and-powered-by-page.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-version-0-6-0-released.html b/site/news/spark-version-0-6-0-released.html
index 1a1a2ac33..aad305c50 100644
--- a/site/news/spark-version-0-6-0-released.html
+++ b/site/news/spark-version-0-6-0-released.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html b/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html
index 269c99b3c..504759ed8 100644
--- a/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html
+++ b/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/strata-exercises-now-available-online.html b/site/news/strata-exercises-now-available-online.html
index 326ea8959..5a8a66302 100644
--- a/site/news/strata-exercises-now-available-online.html
+++ b/site/news/strata-exercises-now-available-online.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/submit-talks-to-spark-summit-2014.html b/site/news/submit-talks-to-spark-summit-2014.html
index 3c474c33a..0c6468972 100644
--- a/site/news/submit-talks-to-spark-summit-2014.html
+++ b/site/news/submit-talks-to-spark-summit-2014.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/submit-talks-to-spark-summit-east-2016.html b/site/news/submit-talks-to-spark-summit-east-2016.html
index e8ba4dc45..172a36fba 100644
--- a/site/news/submit-talks-to-spark-summit-east-2016.html
+++ b/site/news/submit-talks-to-spark-summit-east-2016.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/two-weeks-to-spark-summit-2014.html b/site/news/two-weeks-to-spark-summit-2014.html
index c76f28f26..3a9bde074 100644
--- a/site/news/two-weeks-to-spark-summit-2014.html
+++ b/site/news/two-weeks-to-spark-summit-2014.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/news/video-from-first-spark-development-meetup.html b/site/news/video-from-first-spark-development-meetup.html
index 78b1f05f5..ba8820eaa 100644
--- a/site/news/video-from-first-spark-development-meetup.html
+++ b/site/news/video-from-first-spark-development-meetup.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-3.html b/site/releases/spark-release-0-3.html
index be4bfdbf5..56fc21e98 100644
--- a/site/releases/spark-release-0-3.html
+++ b/site/releases/spark-release-0-3.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-5-0.html b/site/releases/spark-release-0-5-0.html
index 43fcc4202..ebbcc68c1 100644
--- a/site/releases/spark-release-0-5-0.html
+++ b/site/releases/spark-release-0-5-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-5-1.html b/site/releases/spark-release-0-5-1.html
index 789472f87..2e6ebc8e4 100644
--- a/site/releases/spark-release-0-5-1.html
+++ b/site/releases/spark-release-0-5-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-5-2.html b/site/releases/spark-release-0-5-2.html
index 865843853..52f556f53 100644
--- a/site/releases/spark-release-0-5-2.html
+++ b/site/releases/spark-release-0-5-2.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-6-0.html b/site/releases/spark-release-0-6-0.html
index aa1312d1f..526b2d13c 100644
--- a/site/releases/spark-release-0-6-0.html
+++ b/site/releases/spark-release-0-6-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-6-1.html b/site/releases/spark-release-0-6-1.html
index 67de9165a..ce181f317 100644
--- a/site/releases/spark-release-0-6-1.html
+++ b/site/releases/spark-release-0-6-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-6-2.html b/site/releases/spark-release-0-6-2.html
index 74c5ec70f..e2c21a4c2 100644
--- a/site/releases/spark-release-0-6-2.html
+++ b/site/releases/spark-release-0-6-2.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-7-0.html b/site/releases/spark-release-0-7-0.html
index ecf829bc5..2a5cc8c5d 100644
--- a/site/releases/spark-release-0-7-0.html
+++ b/site/releases/spark-release-0-7-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-7-2.html b/site/releases/spark-release-0-7-2.html
index f9e7d5d31..bd1854ba8 100644
--- a/site/releases/spark-release-0-7-2.html
+++ b/site/releases/spark-release-0-7-2.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-7-3.html b/site/releases/spark-release-0-7-3.html
index 31108ad1e..2c012cd4a 100644
--- a/site/releases/spark-release-0-7-3.html
+++ b/site/releases/spark-release-0-7-3.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-8-0.html b/site/releases/spark-release-0-8-0.html
index 8e964b1a9..b48cd4bcf 100644
--- a/site/releases/spark-release-0-8-0.html
+++ b/site/releases/spark-release-0-8-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-8-1.html b/site/releases/spark-release-0-8-1.html
index cd541e33c..01a7f4755 100644
--- a/site/releases/spark-release-0-8-1.html
+++ b/site/releases/spark-release-0-8-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-9-0.html b/site/releases/spark-release-0-9-0.html
index dfb47a5ad..dc6ed3be1 100644
--- a/site/releases/spark-release-0-9-0.html
+++ b/site/releases/spark-release-0-9-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-9-1.html b/site/releases/spark-release-0-9-1.html
index 656d383a4..5d6619020 100644
--- a/site/releases/spark-release-0-9-1.html
+++ b/site/releases/spark-release-0-9-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-0-9-2.html b/site/releases/spark-release-0-9-2.html
index 0c0426dab..bf2150b28 100644
--- a/site/releases/spark-release-0-9-2.html
+++ b/site/releases/spark-release-0-9-2.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-0-0.html b/site/releases/spark-release-1-0-0.html
index 7096c8351..7ffc80d9e 100644
--- a/site/releases/spark-release-1-0-0.html
+++ b/site/releases/spark-release-1-0-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-0-1.html b/site/releases/spark-release-1-0-1.html
index a15b5e02e..be1c3f112 100644
--- a/site/releases/spark-release-1-0-1.html
+++ b/site/releases/spark-release-1-0-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-0-2.html b/site/releases/spark-release-1-0-2.html
index ffcf953e5..c773d053e 100644
--- a/site/releases/spark-release-1-0-2.html
+++ b/site/releases/spark-release-1-0-2.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-1-0.html b/site/releases/spark-release-1-1-0.html
index 3776fea79..0f9da7d41 100644
--- a/site/releases/spark-release-1-1-0.html
+++ b/site/releases/spark-release-1-1-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-1-1.html b/site/releases/spark-release-1-1-1.html
index 387511a21..d772305c9 100644
--- a/site/releases/spark-release-1-1-1.html
+++ b/site/releases/spark-release-1-1-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-2-0.html b/site/releases/spark-release-1-2-0.html
index 82bd94450..7c557163a 100644
--- a/site/releases/spark-release-1-2-0.html
+++ b/site/releases/spark-release-1-2-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-2-1.html b/site/releases/spark-release-1-2-1.html
index 34aa2c347..fd9640409 100644
--- a/site/releases/spark-release-1-2-1.html
+++ b/site/releases/spark-release-1-2-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-2-2.html b/site/releases/spark-release-1-2-2.html
index f95850db3..351d4ffc2 100644
--- a/site/releases/spark-release-1-2-2.html
+++ b/site/releases/spark-release-1-2-2.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-3-0.html b/site/releases/spark-release-1-3-0.html
index b8cffe3cf..90a25f99f 100644
--- a/site/releases/spark-release-1-3-0.html
+++ b/site/releases/spark-release-1-3-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-3-1.html b/site/releases/spark-release-1-3-1.html
index feb9bb1f2..b14d6129e 100644
--- a/site/releases/spark-release-1-3-1.html
+++ b/site/releases/spark-release-1-3-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-4-0.html b/site/releases/spark-release-1-4-0.html
index f1a17be9e..525ff7786 100644
--- a/site/releases/spark-release-1-4-0.html
+++ b/site/releases/spark-release-1-4-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-4-1.html b/site/releases/spark-release-1-4-1.html
index 8e691ac56..df8a167cc 100644
--- a/site/releases/spark-release-1-4-1.html
+++ b/site/releases/spark-release-1-4-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-5-0.html b/site/releases/spark-release-1-5-0.html
index d084008ea..82c2d9288 100644
--- a/site/releases/spark-release-1-5-0.html
+++ b/site/releases/spark-release-1-5-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-5-1.html b/site/releases/spark-release-1-5-1.html
index 02973c670..f06fab1ed 100644
--- a/site/releases/spark-release-1-5-1.html
+++ b/site/releases/spark-release-1-5-1.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-5-2.html b/site/releases/spark-release-1-5-2.html
index 96234a143..119d442b7 100644
--- a/site/releases/spark-release-1-5-2.html
+++ b/site/releases/spark-release-1-5-2.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/releases/spark-release-1-6-0.html b/site/releases/spark-release-1-6-0.html
index 1084f1a20..e2aeedcde 100644
--- a/site/releases/spark-release-1-6-0.html
+++ b/site/releases/spark-release-1-6-0.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/research.html b/site/research.html
index 3e959c225..69edbbf10 100644
--- a/site/research.html
+++ b/site/research.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/screencasts/1-first-steps-with-spark.html b/site/screencasts/1-first-steps-with-spark.html
index cc383cd4b..fe3171a69 100644
--- a/site/screencasts/1-first-steps-with-spark.html
+++ b/site/screencasts/1-first-steps-with-spark.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/screencasts/2-spark-documentation-overview.html b/site/screencasts/2-spark-documentation-overview.html
index 7780ad6fd..d6e1b51fd 100644
--- a/site/screencasts/2-spark-documentation-overview.html
+++ b/site/screencasts/2-spark-documentation-overview.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/screencasts/3-transformations-and-caching.html b/site/screencasts/3-transformations-and-caching.html
index 1bb09e209..93743918f 100644
--- a/site/screencasts/3-transformations-and-caching.html
+++ b/site/screencasts/3-transformations-and-caching.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/screencasts/4-a-standalone-job-in-spark.html b/site/screencasts/4-a-standalone-job-in-spark.html
index 4526162d0..9c017ff5a 100644
--- a/site/screencasts/4-a-standalone-job-in-spark.html
+++ b/site/screencasts/4-a-standalone-job-in-spark.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/screencasts/index.html b/site/screencasts/index.html
index 665613fd4..4725e2bc6 100644
--- a/site/screencasts/index.html
+++ b/site/screencasts/index.html
@@ -18,6 +18,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/sql/index.html b/site/sql/index.html
index ae27de006..b044dcb12 100644
--- a/site/sql/index.html
+++ b/site/sql/index.html
@@ -20,6 +20,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];
diff --git a/site/streaming/index.html b/site/streaming/index.html
index 4d6cf9b4c..faf7781f1 100644
--- a/site/streaming/index.html
+++ b/site/streaming/index.html
@@ -20,6 +20,9 @@
<link href="/css/cerulean.min.css" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
+ <!-- Code highlighter CSS -->
+ <link href="/css/pygments-default.css" rel="stylesheet">
+
<script type="text/javascript">
<!-- Google Analytics initialization -->
var _gaq = _gaq || [];