path: root/site/docs/1.0.1/sql-programming-guide.html
author: Patrick Wendell <pwendell@apache.org> 2014-07-11 17:23:23 +0000
committer: Patrick Wendell <pwendell@apache.org> 2014-07-11 17:23:23 +0000
commit: 0beac4e243f85e71554fe04093b09eb1745fea82 (patch)
tree: bc20d10426c5d57e2f189305865dc2bbec447923 /site/docs/1.0.1/sql-programming-guide.html
parent: ddec2123ba6ab95543d1b250d4f20fb811c48f09 (diff)
download: spark-website-0beac4e243f85e71554fe04093b09eb1745fea82.tar.gz
          spark-website-0beac4e243f85e71554fe04093b09eb1745fea82.tar.bz2
          spark-website-0beac4e243f85e71554fe04093b09eb1745fea82.zip
Updating docs for 1.0.1 release
Diffstat (limited to 'site/docs/1.0.1/sql-programming-guide.html')
-rw-r--r-- site/docs/1.0.1/sql-programming-guide.html | 726
1 file changed, 726 insertions(+), 0 deletions(-)
diff --git a/site/docs/1.0.1/sql-programming-guide.html b/site/docs/1.0.1/sql-programming-guide.html
new file mode 100644
index 000000000..683cea399
--- /dev/null
+++ b/site/docs/1.0.1/sql-programming-guide.html
@@ -0,0 +1,726 @@
+<!DOCTYPE html>
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
+<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
+<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
+ <head>
+ <meta charset="utf-8">
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+ <title>Spark SQL Programming Guide - Spark 1.0.1 Documentation</title>
+ <meta name="description" content="">
+
+
+
+ <link rel="stylesheet" href="css/bootstrap.min.css">
+ <style>
+ body {
+ padding-top: 60px;
+ padding-bottom: 40px;
+ }
+ </style>
+ <meta name="viewport" content="width=device-width">
+ <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
+ <link rel="stylesheet" href="css/main.css">
+
+ <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
+
+ <link rel="stylesheet" href="css/pygments-default.css">
+
+
+ <!-- Google analytics script -->
+ <script type="text/javascript">
+ var _gaq = _gaq || [];
+ _gaq.push(['_setAccount', 'UA-32518208-1']);
+ _gaq.push(['_trackPageview']);
+
+ (function() {
+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+ })();
+ </script>
+
+
+ </head>
+ <body>
+ <!--[if lt IE 7]>
+ <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
+ <![endif]-->
+
+ <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
+
+ <div class="navbar navbar-fixed-top" id="topbar">
+ <div class="navbar-inner">
+ <div class="container">
+ <div class="brand"><a href="index.html">
+ <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">1.0.1</span>
+ </div>
+ <ul class="nav">
+                  <!--TODO(andyk): Add class="active" attribute to li somehow.-->
+ <li><a href="index.html">Overview</a></li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="quick-start.html">Quick Start</a></li>
+ <li><a href="programming-guide.html">Spark Programming Guide</a></li>
+ <li class="divider"></li>
+ <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
+ <li><a href="sql-programming-guide.html">Spark SQL</a></li>
+ <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
+ <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
+ <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="api/scala/index.html#org.apache.spark.package">Scaladoc</a></li>
+ <li><a href="api/java/index.html">Javadoc</a></li>
+ <li><a href="api/python/index.html">Python API</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="cluster-overview.html">Overview</a></li>
+ <li><a href="submitting-applications.html">Submitting Applications</a></li>
+ <li class="divider"></li>
+ <li><a href="ec2-scripts.html">Amazon EC2</a></li>
+ <li><a href="spark-standalone.html">Standalone Mode</a></li>
+ <li><a href="running-on-mesos.html">Mesos</a></li>
+ <li><a href="running-on-yarn.html">YARN</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="configuration.html">Configuration</a></li>
+ <li><a href="monitoring.html">Monitoring</a></li>
+ <li><a href="tuning.html">Tuning Guide</a></li>
+ <li><a href="job-scheduling.html">Job Scheduling</a></li>
+ <li><a href="security.html">Security</a></li>
+ <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
+ <li><a href="hadoop-third-party-distributions.html">3<sup>rd</sup>-Party Hadoop Distros</a></li>
+ <li class="divider"></li>
+ <li><a href="building-with-maven.html">Building Spark with Maven</a></li>
+ <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
+ </ul>
+ </li>
+ </ul>
+ <!--<p class="navbar-text pull-right"><span class="version-text">v1.0.1</span></p>-->
+ </div>
+ </div>
+ </div>
+
+ <div class="container" id="content">
+
+ <h1 class="title">Spark SQL Programming Guide</h1>
+
+
+ <ul id="markdown-toc">
+ <li><a href="#overview">Overview</a></li>
+ <li><a href="#getting-started">Getting Started</a></li>
+ <li><a href="#data-sources">Data Sources</a> <ul>
+ <li><a href="#rdds">RDDs</a></li>
+ <li><a href="#parquet-files">Parquet Files</a></li>
+ <li><a href="#json-datasets">JSON Datasets</a></li>
+ <li><a href="#hive-tables">Hive Tables</a></li>
+ </ul>
+ </li>
+ <li><a href="#writing-language-integrated-relational-queries">Writing Language-Integrated Relational Queries</a></li>
+</ul>
+
+<h1 id="overview">Overview</h1>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <p>Spark SQL allows relational queries expressed in SQL, HiveQL, or Scala to be executed using
+Spark. At the core of this component is a new type of RDD,
+<a href="api/scala/index.html#org.apache.spark.sql.SchemaRDD">SchemaRDD</a>. SchemaRDDs are composed
+<a href="api/scala/index.html#org.apache.spark.sql.catalyst.expressions.Row">Row</a> objects along with
+a schema that describes the data types of each column in the row. A SchemaRDD is similar to a table
+in a traditional relational database. A SchemaRDD can be created from an existing RDD, <a href="http://parquet.io">Parquet</a>
+file, a JSON dataset, or by running HiveQL against data stored in <a href="http://hive.apache.org/">Apache Hive</a>.</p>
+
+ <p>All of the examples on this page use sample data included in the Spark distribution and can be run in the <code>spark-shell</code>.</p>
+
+ </div>
+
+<div data-lang="java">
+ <p>Spark SQL allows relational queries expressed in SQL or HiveQL to be executed using
+Spark. At the core of this component is a new type of RDD,
+<a href="api/scala/index.html#org.apache.spark.sql.api.java.JavaSchemaRDD">JavaSchemaRDD</a>. JavaSchemaRDDs are composed
+<a href="api/scala/index.html#org.apache.spark.sql.api.java.Row">Row</a> objects along with
+a schema that describes the data types of each column in the row. A JavaSchemaRDD is similar to a table
+in a traditional relational database. A JavaSchemaRDD can be created from an existing RDD, <a href="http://parquet.io">Parquet</a>
+file, a JSON dataset, or by running HiveQL against data stored in <a href="http://hive.apache.org/">Apache Hive</a>.</p>
+ </div>
+
+<div data-lang="python">
+
+ <p>Spark SQL allows relational queries expressed in SQL or HiveQL to be executed using
+Spark. At the core of this component is a new type of RDD,
+<a href="api/python/pyspark.sql.SchemaRDD-class.html">SchemaRDD</a>. SchemaRDDs are composed
+<a href="api/python/pyspark.sql.Row-class.html">Row</a> objects along with
+a schema that describes the data types of each column in the row. A SchemaRDD is similar to a table
+in a traditional relational database. A SchemaRDD can be created from an existing RDD, <a href="http://parquet.io">Parquet</a>
+file, a JSON dataset, or by running HiveQL against data stored in <a href="http://hive.apache.org/">Apache Hive</a>.</p>
+
+ <p>All of the examples on this page use sample data included in the Spark distribution and can be run in the <code>pyspark</code> shell.</p>
+ </div>
+</div>
+
+<p><strong>Spark SQL is currently an alpha component. While we will minimize API changes, some APIs may change in future releases.</strong></p>
+
+<hr />
+
+<h1 id="getting-started">Getting Started</h1>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <p>The entry point into all relational functionality in Spark is the
+<a href="api/scala/index.html#org.apache.spark.sql.SQLContext">SQLContext</a> class, or one of its
+descendants. To create a basic SQLContext, all you need is a SparkContext.</p>
+
+ <div class="highlight"><pre><code class="scala"><span class="k">val</span> <span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span> <span class="c1">// An existing SparkContext.</span>
+<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
+
+<span class="c1">// createSchemaRDD is used to implicitly convert an RDD to a SchemaRDD.</span>
+<span class="k">import</span> <span class="nn">sqlContext.createSchemaRDD</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <p>The entry point into all relational functionality in Spark is the
+<a href="api/scala/index.html#org.apache.spark.sql.api.java.JavaSQLContext">JavaSQLContext</a> class, or one
+of its descendants. To create a basic JavaSQLContext, all you need is a JavaSparkContext.</p>
+
+ <div class="highlight"><pre><code class="java"><span class="n">JavaSparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="o">...;</span> <span class="c1">// An existing JavaSparkContext.</span>
+<span class="n">JavaSQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">api</span><span class="o">.</span><span class="na">java</span><span class="o">.</span><span class="na">JavaSQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <p>The entry point into all relational functionality in Spark is the
+<a href="api/python/pyspark.sql.SQLContext-class.html">SQLContext</a> class, or one
+of its descendants. To create a basic SQLContext, all you need is a SparkContext.</p>
+
+ <div class="highlight"><pre><code class="python"><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
+<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
+</code></pre></div>
+
+ </div>
+
+</div>
+
+<h1 id="data-sources">Data Sources</h1>
+
+<div class="codetabs">
+<div data-lang="scala">
+ <p>Spark SQL supports operating on a variety of data sources through the <code>SchemaRDD</code> interface.
+Once a dataset has been loaded, it can be registered as a table and even joined with data from other sources.</p>
+ </div>
+
+<div data-lang="java">
+ <p>Spark SQL supports operating on a variety of data sources through the <code>JavaSchemaRDD</code> interface.
+Once a dataset has been loaded, it can be registered as a table and even joined with data from other sources.</p>
+ </div>
+
+<div data-lang="python">
+ <p>Spark SQL supports operating on a variety of data sources through the <code>SchemaRDD</code> interface.
+Once a dataset has been loaded, it can be registered as a table and even joined with data from other sources.</p>
+ </div>
+</div>
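+
+<p>For example, once tables backed by different sources have been registered, they can be
+combined in a single query. The following is a minimal Scala sketch, not part of the sample data:
+the table and column names are illustrative, and it assumes both tables have already been
+registered with <code>registerAsTable</code>:</p>
+
+<div class="highlight"><pre><code class="scala">// Sketch only: assumes tables &quot;people&quot; (e.g. from Parquet) and &quot;addresses&quot;
+// (e.g. from JSON) have both been registered via registerAsTable.
+val joined = sqlContext.sql(
+  &quot;SELECT p.name, a.city FROM people p JOIN addresses a ON p.name = a.name&quot;)
+joined.collect().foreach(println)
+</code></pre></div>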
+
+<h2 id="rdds">RDDs</h2>
+
+<div class="codetabs">
+
+<div data-lang="scala">
+
+ <p>One type of table that is supported by Spark SQL is an RDD of Scala case classes. The case class
+defines the schema of the table. The names of the arguments to the case class are read using
+reflection and become the names of the columns. Case classes can also be nested or contain complex
+types such as Sequences or Arrays. This RDD can be implicitly converted to a SchemaRDD and then be
+registered as a table. Tables can be used in subsequent SQL statements.</p>
+
+ <div class="highlight"><pre><code class="scala"><span class="c1">// sc is an existing SparkContext.</span>
+<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
+<span class="c1">// createSchemaRDD is used to implicitly convert an RDD to a SchemaRDD.</span>
+<span class="k">import</span> <span class="nn">sqlContext.createSchemaRDD</span>
+
+<span class="c1">// Define the schema using a case class.</span>
+<span class="c1">// Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit, </span>
+<span class="c1">// you can use custom classes that implement the Product interface.</span>
+<span class="k">case</span> <span class="k">class</span> <span class="nc">Person</span><span class="o">(</span><span class="n">name</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> <span class="n">age</span><span class="k">:</span> <span class="kt">Int</span><span class="o">)</span>
+
+<span class="c1">// Create an RDD of Person objects and register it as a table.</span>
+<span class="k">val</span> <span class="n">people</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot;,&quot;</span><span class="o">)).</span><span class="n">map</span><span class="o">(</span><span class="n">p</span> <span class="k">=&gt;</span> <span class="nc">Person</span><span class="o">(</span><span class="n">p</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span> <span class="n">p</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="n">trim</span><span class="o">.</span><span class="n">toInt</span><span class="o">))</span>
+<span class="n">people</span><span class="o">.</span><span class="n">registerAsTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">)</span>
+
+<span class="c1">// SQL statements can be run by using the sql methods provided by sqlContext.</span>
+<span class="k">val</span> <span class="n">teenagers</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">)</span>
+
+<span class="c1">// The results of SQL queries are SchemaRDDs and support all the normal RDD operations.</span>
+<span class="c1">// The columns of a row in the result can be accessed by ordinal.</span>
+<span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">t</span> <span class="k">=&gt;</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">t</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <p>One type of table that is supported by Spark SQL is an RDD of <a href="http://stackoverflow.com/questions/3295496/what-is-a-javabean-exactly">JavaBeans</a>. The BeanInfo
+defines the schema of the table. Currently, Spark SQL does not support JavaBeans that contain
+nested properties or complex types such as Lists or Arrays. You can create a JavaBean by creating a
+class that implements Serializable and has getters and setters for all of its fields.</p>
+
+ <div class="highlight"><pre><code class="java"><span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">Person</span> <span class="kd">implements</span> <span class="n">Serializable</span> <span class="o">{</span>
+ <span class="kd">private</span> <span class="n">String</span> <span class="n">name</span><span class="o">;</span>
+ <span class="kd">private</span> <span class="kt">int</span> <span class="n">age</span><span class="o">;</span>
+
+ <span class="kd">public</span> <span class="n">String</span> <span class="nf">getName</span><span class="o">()</span> <span class="o">{</span>
+ <span class="k">return</span> <span class="n">name</span><span class="o">;</span>
+ <span class="o">}</span>
+
+ <span class="kd">public</span> <span class="kt">void</span> <span class="nf">setName</span><span class="o">(</span><span class="n">String</span> <span class="n">name</span><span class="o">)</span> <span class="o">{</span>
+ <span class="k">this</span><span class="o">.</span><span class="na">name</span> <span class="o">=</span> <span class="n">name</span><span class="o">;</span>
+ <span class="o">}</span>
+
+ <span class="kd">public</span> <span class="kt">int</span> <span class="nf">getAge</span><span class="o">()</span> <span class="o">{</span>
+ <span class="k">return</span> <span class="n">age</span><span class="o">;</span>
+ <span class="o">}</span>
+
+ <span class="kd">public</span> <span class="kt">void</span> <span class="nf">setAge</span><span class="o">(</span><span class="kt">int</span> <span class="n">age</span><span class="o">)</span> <span class="o">{</span>
+ <span class="k">this</span><span class="o">.</span><span class="na">age</span> <span class="o">=</span> <span class="n">age</span><span class="o">;</span>
+ <span class="o">}</span>
+<span class="o">}</span>
+</code></pre></div>
+
+ <p>A schema can be applied to an existing RDD by calling <code>applySchema</code> and providing the Class object
+for the JavaBean.</p>
+
+ <div class="highlight"><pre><code class="java"><span class="c1">// sc is an existing JavaSparkContext.</span>
+<span class="n">JavaSQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">api</span><span class="o">.</span><span class="na">java</span><span class="o">.</span><span class="na">JavaSQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
+
+<span class="c1">// Load a text file and convert each line to a JavaBean.</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Person</span><span class="o">&gt;</span> <span class="n">people</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="o">).</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Person</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Person</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">line</span><span class="o">)</span> <span class="kd">throws</span> <span class="n">Exception</span> <span class="o">{</span>
+ <span class="n">String</span><span class="o">[]</span> <span class="n">parts</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot;,&quot;</span><span class="o">);</span>
+
+ <span class="n">Person</span> <span class="n">person</span> <span class="o">=</span> <span class="k">new</span> <span class="n">Person</span><span class="o">();</span>
+ <span class="n">person</span><span class="o">.</span><span class="na">setName</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">0</span><span class="o">]);</span>
+ <span class="n">person</span><span class="o">.</span><span class="na">setAge</span><span class="o">(</span><span class="n">Integer</span><span class="o">.</span><span class="na">parseInt</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">1</span><span class="o">].</span><span class="na">trim</span><span class="o">()));</span>
+
+ <span class="k">return</span> <span class="n">person</span><span class="o">;</span>
+ <span class="o">}</span>
+ <span class="o">});</span>
+
+<span class="c1">// Apply a schema to an RDD of JavaBeans and register it as a table.</span>
+<span class="n">JavaSchemaRDD</span> <span class="n">schemaPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">applySchema</span><span class="o">(</span><span class="n">people</span><span class="o">,</span> <span class="n">Person</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
+<span class="n">schemaPeople</span><span class="o">.</span><span class="na">registerAsTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">);</span>
+
+<span class="c1">// SQL can be run over RDDs that have been registered as tables.</span>
+<span class="n">JavaSchemaRDD</span> <span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">)</span>
+
+<span class="c1">// The results of SQL queries are SchemaRDDs and support all the normal RDD operations.</span>
+<span class="c1">// The columns of a row in the result can be accessed by ordinal.</span>
+<span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">teenagerNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">String</span> <span class="nf">call</span><span class="o">(</span><span class="n">Row</span> <span class="n">row</span><span class="o">)</span> <span class="o">{</span>
+ <span class="k">return</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">getString</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
+ <span class="o">}</span>
+<span class="o">}).</span><span class="na">collect</span><span class="o">();</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <p>One type of table that is supported by Spark SQL is an RDD of dictionaries. The keys of the
+dictionary define the column names of the table, and the types are inferred by looking at the first
+row. Any RDD of dictionaries can be converted to a SchemaRDD and then registered as a table. Tables
+can be used in subsequent SQL statements.</p>
+
+ <div class="highlight"><pre><code class="python"><span class="c"># sc is an existing SparkContext.</span>
+<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
+<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
+
+<span class="c"># Load a text file and convert each line to a dictionary.</span>
+<span class="n">lines</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="p">)</span>
+<span class="n">parts</span> <span class="o">=</span> <span class="n">lines</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">l</span><span class="p">:</span> <span class="n">l</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&quot;,&quot;</span><span class="p">))</span>
+<span class="n">people</span> <span class="o">=</span> <span class="n">parts</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">{</span><span class="s">&quot;name&quot;</span><span class="p">:</span> <span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="s">&quot;age&quot;</span><span class="p">:</span> <span class="nb">int</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">1</span><span class="p">])})</span>
+
+<span class="c"># Infer the schema, and register the SchemaRDD as a table.</span>
+<span class="c"># In future versions of PySpark we would like to add support for registering RDDs with other</span>
+<span class="c"># datatypes as tables</span>
+<span class="n">schemaPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">inferSchema</span><span class="p">(</span><span class="n">people</span><span class="p">)</span>
+<span class="n">schemaPeople</span><span class="o">.</span><span class="n">registerAsTable</span><span class="p">(</span><span class="s">&quot;people&quot;</span><span class="p">)</span>
+
+<span class="c"># SQL can be run over SchemaRDDs that have been registered as a table.</span>
+<span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span>
+
+<span class="c"># The results of SQL queries are RDDs and support all the normal RDD operations.</span>
+<span class="n">teenNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">p</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">teenName</span> <span class="ow">in</span> <span class="n">teenNames</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span>
+ <span class="k">print</span> <span class="n">teenName</span>
+</code></pre></div>
+
+ </div>
+
+</div>
+
+<p><strong>Note that Spark SQL currently uses a very basic SQL parser.</strong>
+Users who want a more complete dialect of SQL should look at the HiveQL support provided by
+<code>HiveContext</code>.</p>
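+
+<p>A minimal sketch of that alternative (see the Hive Tables section below for the required build
+steps; <code>some_hive_table</code> is a placeholder for a table that already exists in your Hive
+metastore):</p>
+
+<div class="highlight"><pre><code class="scala">// Sketch only: hql parses the richer HiveQL dialect.
+// some_hive_table is a placeholder, not part of the sample data.
+val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
+hiveContext.hql(&quot;SELECT name FROM some_hive_table WHERE age BETWEEN 13 AND 19&quot;).collect()
+</code></pre></div>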
+
+<h2 id="parquet-files">Parquet Files</h2>
+
+<p><a href="http://parquet.io">Parquet</a> is a columnar format that is supported by many other data processing systems.
+Spark SQL supports both reading and writing Parquet files, automatically preserving the schema
+of the original data. Using the data from the above example:</p>
+
+<div class="codetabs">
+
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="scala"><span class="c1">// sqlContext from the previous example is used in this example.</span>
+<span class="c1">// createSchemaRDD is used to implicitly convert an RDD to a SchemaRDD.</span>
+<span class="k">import</span> <span class="nn">sqlContext.createSchemaRDD</span>
+
+<span class="k">val</span> <span class="n">people</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Person</span><span class="o">]</span> <span class="k">=</span> <span class="o">...</span> <span class="c1">// An RDD of case class objects, from the previous example.</span>
+
+<span class="c1">// The RDD is implicitly converted to a SchemaRDD by createSchemaRDD, allowing it to be stored using Parquet.</span>
+<span class="n">people</span><span class="o">.</span><span class="n">saveAsParquetFile</span><span class="o">(</span><span class="s">&quot;people.parquet&quot;</span><span class="o">)</span>
+
+<span class="c1">// Read in the parquet file created above. Parquet files are self-describing so the schema is preserved.</span>
+<span class="c1">// The result of loading a Parquet file is also a SchemaRDD.</span>
+<span class="k">val</span> <span class="n">parquetFile</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">parquetFile</span><span class="o">(</span><span class="s">&quot;people.parquet&quot;</span><span class="o">)</span>
+
+<span class="c1">//Parquet files can also be registered as tables and then used in SQL statements.</span>
+<span class="n">parquetFile</span><span class="o">.</span><span class="n">registerAsTable</span><span class="o">(</span><span class="s">&quot;parquetFile&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">teenagers</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">)</span>
+<span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">t</span> <span class="k">=&gt;</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">t</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="java"><span class="c1">// sqlContext from the previous example is used in this example.</span>
+
+<span class="n">JavaSchemaRDD</span> <span class="n">schemaPeople</span> <span class="o">=</span> <span class="o">...</span> <span class="c1">// The JavaSchemaRDD from the previous example.</span>
+
+<span class="c1">// JavaSchemaRDDs can be saved as Parquet files, maintaining the schema information.</span>
+<span class="n">schemaPeople</span><span class="o">.</span><span class="na">saveAsParquetFile</span><span class="o">(</span><span class="s">&quot;people.parquet&quot;</span><span class="o">);</span>
+
+<span class="c1">// Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.</span>
+<span class="c1">// The result of loading a parquet file is also a JavaSchemaRDD.</span>
+<span class="n">JavaSchemaRDD</span> <span class="n">parquetFile</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">parquetFile</span><span class="o">(</span><span class="s">&quot;people.parquet&quot;</span><span class="o">);</span>
+
+<span class="c1">//Parquet files can also be registered as tables and then used in SQL statements.</span>
+<span class="n">parquetFile</span><span class="o">.</span><span class="na">registerAsTable</span><span class="o">(</span><span class="s">&quot;parquetFile&quot;</span><span class="o">);</span>
+<span class="n">JavaSchemaRDD</span> <span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">);</span>
+<span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">teenagerNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">String</span> <span class="nf">call</span><span class="o">(</span><span class="n">Row</span> <span class="n">row</span><span class="o">)</span> <span class="o">{</span>
+ <span class="k">return</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">getString</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
+ <span class="o">}</span>
+<span class="o">}).</span><span class="na">collect</span><span class="o">();</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="python"><span class="c"># sqlContext from the previous example is used in this example.</span>
+
+<span class="n">schemaPeople</span> <span class="c"># The SchemaRDD from the previous example.</span>
+
+<span class="c"># SchemaRDDs can be saved as Parquet files, maintaining the schema information.</span>
+<span class="n">schemaPeople</span><span class="o">.</span><span class="n">saveAsParquetFile</span><span class="p">(</span><span class="s">&quot;people.parquet&quot;</span><span class="p">)</span>
+
+<span class="c"># Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.</span>
+<span class="c"># The result of loading a parquet file is also a SchemaRDD.</span>
+<span class="n">parquetFile</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">parquetFile</span><span class="p">(</span><span class="s">&quot;people.parquet&quot;</span><span class="p">)</span>
+
+<span class="c"># Parquet files can also be registered as tables and then used in SQL statements.</span>
+<span class="n">parquetFile</span><span class="o">.</span><span class="n">registerAsTable</span><span class="p">(</span><span class="s">&quot;parquetFile&quot;</span><span class="p">);</span>
+<span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span>
+<span class="n">teenNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">p</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">teenName</span> <span class="ow">in</span> <span class="n">teenNames</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span>
+ <span class="k">print</span> <span class="n">teenName</span>
+</code></pre></div>
+
+ </div>
+
+</div>
+
+<h2 id="json-datasets">JSON Datasets</h2>
+<div class="codetabs">
+
+<div data-lang="scala">
+ <p>Spark SQL can automatically infer the schema of a JSON dataset and load it as a SchemaRDD.
+This conversion can be done using one of two methods in a SQLContext:</p>
+
+ <ul>
+ <li><code>jsonFile</code> - loads data from a directory of JSON files where each line of the files is a JSON object.</li>
+    <li><code>jsonRDD</code> - loads data from an existing RDD where each element of the RDD is a string containing a JSON object.</li>
+ </ul>
+
+ <div class="highlight"><pre><code class="scala"><span class="c1">// sc is an existing SparkContext.</span>
+<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
+
+<span class="c1">// A JSON dataset is pointed to by path.</span>
+<span class="c1">// The path can be either a single text file or a directory storing text files.</span>
+<span class="k">val</span> <span class="n">path</span> <span class="k">=</span> <span class="s">&quot;examples/src/main/resources/people.json&quot;</span>
+<span class="c1">// Create a SchemaRDD from the file(s) pointed to by path</span>
+<span class="k">val</span> <span class="n">people</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">jsonFile</span><span class="o">(</span><span class="n">path</span><span class="o">)</span>
+
+<span class="c1">// The inferred schema can be visualized using the printSchema() method.</span>
+<span class="n">people</span><span class="o">.</span><span class="n">printSchema</span><span class="o">()</span>
+<span class="c1">// root</span>
+<span class="c1">// |-- age: IntegerType</span>
+<span class="c1">// |-- name: StringType</span>
+
+<span class="c1">// Register this SchemaRDD as a table.</span>
+<span class="n">people</span><span class="o">.</span><span class="n">registerAsTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">)</span>
+
+<span class="c1">// SQL statements can be run by using the sql methods provided by sqlContext.</span>
+<span class="k">val</span> <span class="n">teenagers</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">)</span>
+
+<span class="c1">// Alternatively, a SchemaRDD can be created for a JSON dataset represented by</span>
+<span class="c1">// an RDD[String] storing one JSON object per string.</span>
+<span class="k">val</span> <span class="n">anotherPeopleRDD</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="o">(</span>
+ <span class="s">&quot;&quot;&quot;{&quot;name&quot;:&quot;Yin&quot;,&quot;address&quot;:{&quot;city&quot;:&quot;Columbus&quot;,&quot;state&quot;:&quot;Ohio&quot;}}&quot;&quot;&quot;</span> <span class="o">::</span> <span class="nc">Nil</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">anotherPeople</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">jsonRDD</span><span class="o">(</span><span class="n">anotherPeopleRDD</span><span class="o">)</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+ <p>Spark SQL can automatically infer the schema of a JSON dataset and load it as a JavaSchemaRDD.
+This conversion can be done using one of two methods in a JavaSQLContext:</p>
+
+ <ul>
+ <li><code>jsonFile</code> - loads data from a directory of JSON files where each line of the files is a JSON object.</li>
+    <li><code>jsonRDD</code> - loads data from an existing RDD where each element of the RDD is a string containing a JSON object.</li>
+ </ul>
+
+ <div class="highlight"><pre><code class="java"><span class="c1">// sc is an existing JavaSparkContext.</span>
+<span class="n">JavaSQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">api</span><span class="o">.</span><span class="na">java</span><span class="o">.</span><span class="na">JavaSQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span>
+
+<span class="c1">// A JSON dataset is pointed to by path.</span>
+<span class="c1">// The path can be either a single text file or a directory storing text files.</span>
+<span class="n">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="o">;</span>
+<span class="c1">// Create a JavaSchemaRDD from the file(s) pointed to by path</span>
+<span class="n">JavaSchemaRDD</span> <span class="n">people</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">jsonFile</span><span class="o">(</span><span class="n">path</span><span class="o">);</span>
+
+<span class="c1">// The inferred schema can be visualized using the printSchema() method.</span>
+<span class="n">people</span><span class="o">.</span><span class="na">printSchema</span><span class="o">();</span>
+<span class="c1">// root</span>
+<span class="c1">// |-- age: IntegerType</span>
+<span class="c1">// |-- name: StringType</span>
+
+<span class="c1">// Register this JavaSchemaRDD as a table.</span>
+<span class="n">people</span><span class="o">.</span><span class="na">registerAsTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">);</span>
+
+<span class="c1">// SQL statements can be run by using the sql methods provided by sqlContext.</span>
+<span class="n">JavaSchemaRDD</span> <span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">);</span>
+
+<span class="c1">// Alternatively, a JavaSchemaRDD can be created for a JSON dataset represented by</span>
+<span class="c1">// an RDD[String] storing one JSON object per string.</span>
+<span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">jsonData</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="s">&quot;{\&quot;name\&quot;:\&quot;Yin\&quot;,\&quot;address\&quot;:{\&quot;city\&quot;:\&quot;Columbus\&quot;,\&quot;state\&quot;:\&quot;Ohio\&quot;}}&quot;</span><span class="o">);</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">anotherPeopleRDD</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">jsonData</span><span class="o">);</span>
+<span class="n">JavaSchemaRDD</span> <span class="n">anotherPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">jsonRDD</span><span class="o">(</span><span class="n">anotherPeopleRDD</span><span class="o">);</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+ <p>Spark SQL can automatically infer the schema of a JSON dataset and load it as a SchemaRDD.
+This conversion can be done using one of two methods in a SQLContext:</p>
+
+ <ul>
+ <li><code>jsonFile</code> - loads data from a directory of JSON files where each line of the files is a JSON object.</li>
+    <li><code>jsonRDD</code> - loads data from an existing RDD where each element of the RDD is a string containing a JSON object.</li>
+ </ul>
+
+ <div class="highlight"><pre><code class="python"><span class="c"># sc is an existing SparkContext.</span>
+<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
+<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
+
+<span class="c"># A JSON dataset is pointed to by path.</span>
+<span class="c"># The path can be either a single text file or a directory storing text files.</span>
+<span class="n">path</span> <span class="o">=</span> <span class="s">&quot;examples/src/main/resources/people.json&quot;</span>
+<span class="c"># Create a SchemaRDD from the file(s) pointed to by path</span>
+<span class="n">people</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">jsonFile</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
+
+<span class="c"># The inferred schema can be visualized using the printSchema() method.</span>
+<span class="n">people</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
+<span class="c"># root</span>
+<span class="c"># |-- age: IntegerType</span>
+<span class="c"># |-- name: StringType</span>
+
+<span class="c"># Register this SchemaRDD as a table.</span>
+<span class="n">people</span><span class="o">.</span><span class="n">registerAsTable</span><span class="p">(</span><span class="s">&quot;people&quot;</span><span class="p">)</span>
+
+<span class="c"># SQL statements can be run by using the sql methods provided by sqlContext.</span>
+<span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span>
+
+<span class="c"># Alternatively, a SchemaRDD can be created for a JSON dataset represented by</span>
+<span class="c"># an RDD[String] storing one JSON object per string.</span>
+<span class="n">anotherPeopleRDD</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
+ <span class="s">&#39;{&quot;name&quot;:&quot;Yin&quot;,&quot;address&quot;:{&quot;city&quot;:&quot;Columbus&quot;,&quot;state&quot;:&quot;Ohio&quot;}}&#39;</span><span class="p">])</span>
+<span class="n">anotherPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">jsonRDD</span><span class="p">(</span><span class="n">anotherPeopleRDD</span><span class="p">)</span>
+</code></pre></div>
+
+ </div>
+
+</div>
+
+<h2 id="hive-tables">Hive Tables</h2>
+
+<p>Spark SQL also supports reading and writing data stored in <a href="http://hive.apache.org/">Apache Hive</a>.
+However, since Hive has a large number of dependencies, it is not included in the default Spark assembly.
+In order to use Hive you must first run &#8216;<code>SPARK_HIVE=true sbt/sbt assembly/assembly</code>&#8217; (or use <code>-Phive</code> for Maven).
+This command builds a new assembly jar that includes Hive. Note that this Hive assembly jar must also be present
+on all of the worker nodes, as they will need access to the Hive serialization and deserialization libraries
+(SerDes) in order to access data stored in Hive.</p>
+
+<p>Configuration of Hive is done by placing your <code>hive-site.xml</code> file in <code>conf/</code>.</p>
+
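+<p>As an illustrative sketch only, a minimal <code>hive-site.xml</code> might contain nothing but the
+metastore URI; the host and port below are placeholders for your own metastore service:</p>
+
+<div class="highlight"><pre><code class="xml">&lt;configuration&gt;
+  &lt;!-- Placeholder value: point this at your own metastore service. --&gt;
+  &lt;property&gt;
+    &lt;name&gt;hive.metastore.uris&lt;/name&gt;
+    &lt;value&gt;thrift://metastore-host:9083&lt;/value&gt;
+  &lt;/property&gt;
+&lt;/configuration&gt;
+</code></pre></div>
+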
+<div class="codetabs">
+
+<div data-lang="scala">
+
+ <p>When working with Hive one must construct a <code>HiveContext</code>, which inherits from <code>SQLContext</code>, and
+adds support for finding tables in the MetaStore and writing queries using HiveQL. Users who do
+not have an existing Hive deployment can also experiment with the <code>LocalHiveContext</code>,
+which is similar to <code>HiveContext</code>, but creates a local copy of the <code>metastore</code> and <code>warehouse</code>
+automatically.</p>
+
+ <div class="highlight"><pre><code class="scala"><span class="c1">// sc is an existing SparkContext.</span>
+<span class="k">val</span> <span class="n">hiveContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="nc">HiveContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
+
+<span class="n">hiveContext</span><span class="o">.</span><span class="n">hql</span><span class="o">(</span><span class="s">&quot;CREATE TABLE IF NOT EXISTS src (key INT, value STRING)&quot;</span><span class="o">)</span>
+<span class="n">hiveContext</span><span class="o">.</span><span class="n">hql</span><span class="o">(</span><span class="s">&quot;LOAD DATA LOCAL INPATH &#39;examples/src/main/resources/kv1.txt&#39; INTO TABLE src&quot;</span><span class="o">)</span>
+
+<span class="c1">// Queries are expressed in HiveQL</span>
+<span class="n">hiveContext</span><span class="o">.</span><span class="n">hql</span><span class="o">(</span><span class="s">&quot;FROM src SELECT key, value&quot;</span><span class="o">).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <p>When working with Hive one must construct a <code>JavaHiveContext</code>, which inherits from <code>JavaSQLContext</code>, and
+adds support for finding tables in the MetaStore and writing queries using HiveQL. In addition to
+the <code>sql</code> method, a <code>JavaHiveContext</code> also provides an <code>hql</code> method, which allows queries to be
+expressed in HiveQL.</p>
+
+ <div class="highlight"><pre><code class="java"><span class="c1">// sc is an existing JavaSparkContext.</span>
+<span class="n">JavaHiveContext</span> <span class="n">hiveContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">hive</span><span class="o">.</span><span class="na">api</span><span class="o">.</span><span class="na">java</span><span class="o">.</span><span class="na">HiveContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span>
+
+<span class="n">hiveContext</span><span class="o">.</span><span class="na">hql</span><span class="o">(</span><span class="s">&quot;CREATE TABLE IF NOT EXISTS src (key INT, value STRING)&quot;</span><span class="o">);</span>
+<span class="n">hiveContext</span><span class="o">.</span><span class="na">hql</span><span class="o">(</span><span class="s">&quot;LOAD DATA LOCAL INPATH &#39;examples/src/main/resources/kv1.txt&#39; INTO TABLE src&quot;</span><span class="o">);</span>
+
+<span class="c1">// Queries are expressed in HiveQL.</span>
+<span class="n">Row</span><span class="o">[]</span> <span class="n">results</span> <span class="o">=</span> <span class="n">hiveContext</span><span class="o">.</span><span class="na">hql</span><span class="o">(</span><span class="s">&quot;FROM src SELECT key, value&quot;</span><span class="o">).</span><span class="na">collect</span><span class="o">();</span>
+</code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <p>When working with Hive one must construct a <code>HiveContext</code>, which inherits from <code>SQLContext</code>, and
+adds support for finding tables in the MetaStore and writing queries using HiveQL. In addition to
+the <code>sql</code> method, a <code>HiveContext</code> also provides an <code>hql</code> method, which allows queries to be
+expressed in HiveQL.</p>
+
+ <div class="highlight"><pre><code class="python"><span class="c"># sc is an existing SparkContext.</span>
+<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">HiveContext</span>
+<span class="n">hiveContext</span> <span class="o">=</span> <span class="n">HiveContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
+
+<span class="n">hiveContext</span><span class="o">.</span><span class="n">hql</span><span class="p">(</span><span class="s">&quot;CREATE TABLE IF NOT EXISTS src (key INT, value STRING)&quot;</span><span class="p">)</span>
+<span class="n">hiveContext</span><span class="o">.</span><span class="n">hql</span><span class="p">(</span><span class="s">&quot;LOAD DATA LOCAL INPATH &#39;examples/src/main/resources/kv1.txt&#39; INTO TABLE src&quot;</span><span class="p">)</span>
+
+<span class="c"># Queries can be expressed in HiveQL.</span>
+<span class="n">results</span> <span class="o">=</span> <span class="n">hiveContext</span><span class="o">.</span><span class="n">hql</span><span class="p">(</span><span class="s">&quot;FROM src SELECT key, value&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
+</code></pre></div>
+
+ </div>
+</div>
+
+<h1 id="writing-language-integrated-relational-queries">Writing Language-Integrated Relational Queries</h1>
+
+<p><strong>Language-Integrated queries are currently only supported in Scala.</strong></p>
+
+<p>Spark SQL also supports a domain specific language for writing queries. Once again,
+using the data from the above examples:</p>
+
+<div class="highlight"><pre><code class="scala"><span class="c1">// sc is an existing SparkContext.</span>
+<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
+<span class="c1">// Importing the SQL context gives access to all the public SQL functions and implicit conversions.</span>
+<span class="k">import</span> <span class="nn">sqlContext._</span>
+<span class="k">val</span> <span class="n">people</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Person</span><span class="o">]</span> <span class="k">=</span> <span class="o">...</span> <span class="c1">// An RDD of case class objects, from the first example.</span>
+
+<span class="c1">// The following is the same as &#39;SELECT name FROM people WHERE age &gt;= 10 AND age &lt;= 19&#39;</span>
+<span class="k">val</span> <span class="n">teenagers</span> <span class="k">=</span> <span class="n">people</span><span class="o">.</span><span class="n">where</span><span class="o">(</span><span class="-Symbol">&#39;age</span> <span class="o">&gt;=</span> <span class="mi">10</span><span class="o">).</span><span class="n">where</span><span class="o">(</span><span class="-Symbol">&#39;age</span> <span class="o">&lt;=</span> <span class="mi">19</span><span class="o">).</span><span class="n">select</span><span class="o">(</span><span class="-Symbol">&#39;name</span><span class="o">)</span>
+<span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">t</span> <span class="k">=&gt;</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">t</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span>
+</code></pre></div>
+
+<p>The DSL uses Scala symbols to represent columns in the underlying table, which are identifiers
+prefixed with a tick (<code>'</code>). Implicit conversions turn these symbols into expressions that are
+evaluated by the SQL execution engine. A full list of the functions supported can be found in the
+<a href="api/scala/index.html#org.apache.spark.sql.SchemaRDD">ScalaDoc</a>.</p>
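+
+<p>As a further sketch, other operators compose in the same way. This example assumes that
+<code>orderBy</code> and the <code>desc</code> sort modifier are available on <code>SchemaRDD</code>
+through the implicits imported from <code>sqlContext</code>; consult the ScalaDoc linked above for
+the definitive list:</p>
+
+<div class="highlight"><pre><code class="scala">// Equivalent to SELECT name, age FROM people WHERE age &gt;= 10 ORDER BY age DESC
+val ordered = people.where('age &gt;= 10).orderBy('age.desc).select('name, 'age)
+ordered.collect().foreach(println)
+</code></pre></div>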
+
+<!-- TODO: Include the table of operations here. -->
+
+
+ </div> <!-- /container -->
+
+ <script src="js/vendor/jquery-1.8.0.min.js"></script>
+ <script src="js/vendor/bootstrap.min.js"></script>
+ <script src="js/main.js"></script>
+
+ <!-- MathJax Section -->
+ <script type="text/x-mathjax-config">
+ MathJax.Hub.Config({
+ TeX: { equationNumbers: { autoNumber: "AMS" } }
+ });
+ </script>
+ <script>
+ // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
+ // We could use "//cdn.mathjax...", but that won't support "file://".
+ (function(d, script) {
+ script = d.createElement('script');
+ script.type = 'text/javascript';
+ script.async = true;
+ script.onload = function(){
+ MathJax.Hub.Config({
+ tex2jax: {
+ inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
+ displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
+ processEscapes: true,
+ skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+ }
+ });
+ };
+ script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
+ 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+ d.getElementsByTagName('head')[0].appendChild(script);
+ }(document));
+ </script>
+ </body>
+</html>