path: root/site/docs/1.5.0/sql-programming-guide.html
author     Reynold Xin <rxin@apache.org>   2015-09-17 22:07:42 +0000
committer  Reynold Xin <rxin@apache.org>   2015-09-17 22:07:42 +0000
commit     ee9ffe89d608e7640a2487406b618d27e58026d6 (patch)
tree       50ec819abb41a9a769d7f64eed1f0ab2084aa6ff /site/docs/1.5.0/sql-programming-guide.html
parent     c7104724b279f09486ea62f4a24252e8d06f5c96 (diff)
download   spark-website-ee9ffe89d608e7640a2487406b618d27e58026d6.tar.gz
           spark-website-ee9ffe89d608e7640a2487406b618d27e58026d6.tar.bz2
           spark-website-ee9ffe89d608e7640a2487406b618d27e58026d6.zip
delete 1.5.0
Diffstat (limited to 'site/docs/1.5.0/sql-programming-guide.html')
-rw-r--r--  site/docs/1.5.0/sql-programming-guide.html  3154
1 files changed, 0 insertions, 3154 deletions
diff --git a/site/docs/1.5.0/sql-programming-guide.html b/site/docs/1.5.0/sql-programming-guide.html
deleted file mode 100644
index 66b86392c..000000000
--- a/site/docs/1.5.0/sql-programming-guide.html
+++ /dev/null
@@ -1,3154 +0,0 @@
-<!DOCTYPE html>
-<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
-<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
-<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
-<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
- <head>
- <meta charset="utf-8">
- <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
- <title>Spark SQL and DataFrames - Spark 1.5.0 Documentation</title>
-
-
-
-
- <link rel="stylesheet" href="css/bootstrap.min.css">
- <style>
- body {
- padding-top: 60px;
- padding-bottom: 40px;
- }
- </style>
- <meta name="viewport" content="width=device-width">
- <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
- <link rel="stylesheet" href="css/main.css">
-
- <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
-
- <link rel="stylesheet" href="css/pygments-default.css">
-
-
- <!-- Google analytics script -->
- <script type="text/javascript">
- var _gaq = _gaq || [];
- _gaq.push(['_setAccount', 'UA-32518208-2']);
- _gaq.push(['_trackPageview']);
-
- (function() {
- var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
- ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
- var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
- })();
- </script>
-
-
- </head>
- <body>
- <!--[if lt IE 7]>
- <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
- <![endif]-->
-
- <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
-
- <div class="navbar navbar-fixed-top" id="topbar">
- <div class="navbar-inner">
- <div class="container">
- <div class="brand"><a href="index.html">
- <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">1.5.0</span>
- </div>
- <ul class="nav">
- <!--TODO(andyk): Add class="active" attribute to li somehow.-->
- <li><a href="index.html">Overview</a></li>
-
- <li class="dropdown">
- <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
- <ul class="dropdown-menu">
- <li><a href="quick-start.html">Quick Start</a></li>
- <li><a href="programming-guide.html">Spark Programming Guide</a></li>
- <li class="divider"></li>
- <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
- <li><a href="sql-programming-guide.html">DataFrames and SQL</a></li>
- <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
- <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
- <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
- <li><a href="sparkr.html">SparkR (R on Spark)</a></li>
- </ul>
- </li>
-
- <li class="dropdown">
- <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
- <ul class="dropdown-menu">
- <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li>
- <li><a href="api/java/index.html">Java</a></li>
- <li><a href="api/python/index.html">Python</a></li>
- <li><a href="api/R/index.html">R</a></li>
- </ul>
- </li>
-
- <li class="dropdown">
- <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
- <ul class="dropdown-menu">
- <li><a href="cluster-overview.html">Overview</a></li>
- <li><a href="submitting-applications.html">Submitting Applications</a></li>
- <li class="divider"></li>
- <li><a href="spark-standalone.html">Spark Standalone</a></li>
- <li><a href="running-on-mesos.html">Mesos</a></li>
- <li><a href="running-on-yarn.html">YARN</a></li>
- <li class="divider"></li>
- <li><a href="ec2-scripts.html">Amazon EC2</a></li>
- </ul>
- </li>
-
- <li class="dropdown">
- <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
- <ul class="dropdown-menu">
- <li><a href="configuration.html">Configuration</a></li>
- <li><a href="monitoring.html">Monitoring</a></li>
- <li><a href="tuning.html">Tuning Guide</a></li>
- <li><a href="job-scheduling.html">Job Scheduling</a></li>
- <li><a href="security.html">Security</a></li>
- <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
- <li><a href="hadoop-third-party-distributions.html">3<sup>rd</sup>-Party Hadoop Distros</a></li>
- <li class="divider"></li>
- <li><a href="building-spark.html">Building Spark</a></li>
- <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
- <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects">Supplemental Projects</a></li>
- </ul>
- </li>
- </ul>
- <!--<p class="navbar-text pull-right"><span class="version-text">v1.5.0</span></p>-->
- </div>
- </div>
- </div>
-
- <div class="container" id="content">
-
- <h1 class="title">Spark SQL and DataFrame Guide</h1>
-
-
- <ul id="markdown-toc">
- <li><a href="#overview">Overview</a></li>
- <li><a href="#dataframes">DataFrames</a> <ul>
- <li><a href="#starting-point-sqlcontext">Starting Point: SQLContext</a></li>
- <li><a href="#creating-dataframes">Creating DataFrames</a></li>
- <li><a href="#dataframe-operations">DataFrame Operations</a></li>
- <li><a href="#running-sql-queries-programmatically">Running SQL Queries Programmatically</a></li>
- <li><a href="#interoperating-with-rdds">Interoperating with RDDs</a> <ul>
- <li><a href="#inferring-the-schema-using-reflection">Inferring the Schema Using Reflection</a></li>
- <li><a href="#programmatically-specifying-the-schema">Programmatically Specifying the Schema</a></li>
- </ul>
- </li>
- </ul>
- </li>
- <li><a href="#data-sources">Data Sources</a> <ul>
- <li><a href="#generic-loadsave-functions">Generic Load/Save Functions</a> <ul>
- <li><a href="#manually-specifying-options">Manually Specifying Options</a></li>
- <li><a href="#save-modes">Save Modes</a></li>
- <li><a href="#saving-to-persistent-tables">Saving to Persistent Tables</a></li>
- </ul>
- </li>
- <li><a href="#parquet-files">Parquet Files</a> <ul>
- <li><a href="#loading-data-programmatically">Loading Data Programmatically</a></li>
- <li><a href="#partition-discovery">Partition Discovery</a></li>
- <li><a href="#schema-merging">Schema Merging</a></li>
- <li><a href="#hive-metastore-parquet-table-conversion">Hive metastore Parquet table conversion</a> <ul>
- <li><a href="#hiveparquet-schema-reconciliation">Hive/Parquet Schema Reconciliation</a></li>
- <li><a href="#metadata-refreshing">Metadata Refreshing</a></li>
- </ul>
- </li>
- <li><a href="#configuration">Configuration</a></li>
- </ul>
- </li>
- <li><a href="#json-datasets">JSON Datasets</a></li>
- <li><a href="#hive-tables">Hive Tables</a> <ul>
- <li><a href="#interacting-with-different-versions-of-hive-metastore">Interacting with Different Versions of Hive Metastore</a></li>
- </ul>
- </li>
- <li><a href="#jdbc-to-other-databases">JDBC To Other Databases</a></li>
- <li><a href="#troubleshooting">Troubleshooting</a></li>
- </ul>
- </li>
- <li><a href="#performance-tuning">Performance Tuning</a> <ul>
- <li><a href="#caching-data-in-memory">Caching Data In Memory</a></li>
- <li><a href="#other-configuration-options">Other Configuration Options</a></li>
- </ul>
- </li>
- <li><a href="#distributed-sql-engine">Distributed SQL Engine</a> <ul>
- <li><a href="#running-the-thrift-jdbcodbc-server">Running the Thrift JDBC/ODBC server</a></li>
- <li><a href="#running-the-spark-sql-cli">Running the Spark SQL CLI</a></li>
- </ul>
- </li>
- <li><a href="#migration-guide">Migration Guide</a> <ul>
- <li><a href="#upgrading-from-spark-sql-14-to-15">Upgrading From Spark SQL 1.4 to 1.5</a></li>
- <li><a href="#upgrading-from-spark-sql-13-to-14">Upgrading from Spark SQL 1.3 to 1.4</a> <ul>
- <li><a href="#dataframe-data-readerwriter-interface">DataFrame data reader/writer interface</a></li>
- <li><a href="#dataframegroupby-retains-grouping-columns">DataFrame.groupBy retains grouping columns</a></li>
- </ul>
- </li>
- <li><a href="#upgrading-from-spark-sql-10-12-to-13">Upgrading from Spark SQL 1.0-1.2 to 1.3</a> <ul>
- <li><a href="#rename-of-schemardd-to-dataframe">Rename of SchemaRDD to DataFrame</a></li>
- <li><a href="#unification-of-the-java-and-scala-apis">Unification of the Java and Scala APIs</a></li>
- <li><a href="#isolation-of-implicit-conversions-and-removal-of-dsl-package-scala-only">Isolation of Implicit Conversions and Removal of dsl Package (Scala-only)</a></li>
- <li><a href="#removal-of-the-type-aliases-in-orgapachesparksql-for-datatype-scala-only">Removal of the type aliases in org.apache.spark.sql for DataType (Scala-only)</a></li>
- <li><a href="#udf-registration-moved-to-sqlcontextudf-java--scala">UDF Registration Moved to <code>sqlContext.udf</code> (Java &amp; Scala)</a></li>
- <li><a href="#python-datatypes-no-longer-singletons">Python DataTypes No Longer Singletons</a></li>
- </ul>
- </li>
- <li><a href="#migration-guide-for-shark-users">Migration Guide for Shark Users</a> <ul>
- <li><a href="#scheduling">Scheduling</a></li>
- <li><a href="#reducer-number">Reducer number</a></li>
- <li><a href="#caching">Caching</a></li>
- </ul>
- </li>
- <li><a href="#compatibility-with-apache-hive">Compatibility with Apache Hive</a> <ul>
- <li><a href="#deploying-in-existing-hive-warehouses">Deploying in Existing Hive Warehouses</a></li>
- <li><a href="#supported-hive-features">Supported Hive Features</a></li>
- <li><a href="#unsupported-hive-functionality">Unsupported Hive Functionality</a></li>
- </ul>
- </li>
- </ul>
- </li>
- <li><a href="#reference">Reference</a> <ul>
- <li><a href="#data-types">Data Types</a></li>
- <li><a href="#nan-semantics">NaN Semantics</a></li>
- </ul>
- </li>
-</ul>
-
-<h1 id="overview">Overview</h1>
-
-<p>Spark SQL is a Spark module for structured data processing. It provides a programming abstraction called DataFrames and can also act as a distributed SQL query engine.</p>
-
-<p>Spark SQL can also be used to read data from an existing Hive installation. For more on how to configure this feature, please refer to the <a href="#hive-tables">Hive Tables</a> section.</p>
-
-<h1 id="dataframes">DataFrames</h1>
-
-<p>A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. DataFrames can be constructed from a wide array of sources such as: structured data files, tables in Hive, external databases, or existing RDDs.</p>
-
-<p>The DataFrame API is available in <a href="api/scala/index.html#org.apache.spark.sql.DataFrame">Scala</a>, <a href="api/java/index.html?org/apache/spark/sql/DataFrame.html">Java</a>, <a href="api/python/pyspark.sql.html#pyspark.sql.DataFrame">Python</a>, and <a href="api/R/index.html">R</a>.</p>
-
-<p>All of the examples on this page use sample data included in the Spark distribution and can be run in the <code>spark-shell</code>, <code>pyspark</code> shell, or <code>sparkR</code> shell.</p>
-
-<h2 id="starting-point-sqlcontext">Starting Point: SQLContext</h2>
-
-<div class="codetabs">
-<div data-lang="scala">
-
- <p>The entry point into all functionality in Spark SQL is the
-<a href="api/scala/index.html#org.apache.spark.sql.SQLContext"><code>SQLContext</code></a> class, or one of its
-descendants. To create a basic <code>SQLContext</code>, all you need is a SparkContext.</p>
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span> <span class="c1">// An existing SparkContext.</span>
-<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
-
-<span class="c1">// this is used to implicitly convert an RDD to a DataFrame.</span>
-<span class="k">import</span> <span class="nn">sqlContext.implicits._</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <p>The entry point into all functionality in Spark SQL is the
-<a href="api/java/index.html#org.apache.spark.sql.SQLContext"><code>SQLContext</code></a> class, or one of its
-descendants. To create a basic <code>SQLContext</code>, all you need is a SparkContext.</p>
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">JavaSparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="o">...;</span> <span class="c1">// An existing JavaSparkContext.</span>
-<span class="n">SQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <p>The entry point into all relational functionality in Spark is the
-<a href="api/python/pyspark.sql.html#pyspark.sql.SQLContext"><code>SQLContext</code></a> class, or one
-of its descendants. To create a basic <code>SQLContext</code>, all you need is a SparkContext.</p>
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
-<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-
- <p>The entry point into all relational functionality in Spark is the
-<code>SQLContext</code> class, or one of its descendants. To create a basic <code>SQLContext</code>, all you need is a SparkContext.</p>
-
- <div class="highlight"><pre><code class="language-r" data-lang="r">sqlContext <span class="o">&lt;-</span> sparkRSQL.init<span class="p">(</span>sc<span class="p">)</span></code></pre></div>
-
- </div>
-</div>
-
-<p>In addition to the basic <code>SQLContext</code>, you can also create a <code>HiveContext</code>, which provides a
-superset of the functionality provided by the basic <code>SQLContext</code>. Additional features include
-the ability to write queries using the more complete HiveQL parser, access to Hive UDFs, and the
-ability to read data from Hive tables. To use a <code>HiveContext</code>, you do not need to have an
-existing Hive setup, and all of the data sources available to a <code>SQLContext</code> are still available.
-<code>HiveContext</code> is only packaged separately to avoid including all of Hive&#8217;s dependencies in the default
-Spark build. If these dependencies are not a problem for your application then using <code>HiveContext</code>
-is recommended. Future releases will focus on bringing <code>SQLContext</code> up
-to feature parity with a <code>HiveContext</code>.</p>
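-
-<p>For example, a <code>HiveContext</code> is created in the same way as a <code>SQLContext</code>. The
-snippet below is a minimal sketch; it assumes your build includes the separately packaged Hive module that
-provides <code>org.apache.spark.sql.hive.HiveContext</code>.</p>
-
-<div class="highlight"><pre><code class="language-scala" data-lang="scala">val sc: SparkContext // An existing SparkContext.
-
-// HiveContext ships in the separately packaged Hive module.
-val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
-
-// Everything available on a SQLContext, including the implicit conversions,
-// is also available on a HiveContext.
-import sqlContext.implicits._</code></pre></div>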
-
-<p>The specific variant of SQL that is used to parse queries can also be selected using the
-<code>spark.sql.dialect</code> option. This parameter can be changed using either the <code>setConf</code> method on
-a <code>SQLContext</code> or by using a <code>SET key=value</code> command in SQL. For a <code>SQLContext</code>, the only dialect
-available is &#8220;sql&#8221; which uses a simple SQL parser provided by Spark SQL. In a <code>HiveContext</code>, the
-default is &#8220;hiveql&#8221;, though &#8220;sql&#8221; is also available. Since the HiveQL parser is much more complete,
-this is recommended for most use cases.</p>
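-
-<p>For example, the dialect can be switched either through <code>setConf</code> or with a <code>SET</code>
-command submitted through <code>sql</code>. The snippet below is a minimal sketch; it assumes
-<code>sqlContext</code> is a <code>HiveContext</code>, where both dialects are available.</p>
-
-<div class="highlight"><pre><code class="language-scala" data-lang="scala">// Switch to the simple SQL parser provided by Spark SQL...
-sqlContext.setConf("spark.sql.dialect", "sql")
-
-// ...or switch back to the HiveQL parser with a SET command in SQL.
-sqlContext.sql("SET spark.sql.dialect=hiveql")</code></pre></div>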
-
-<h2 id="creating-dataframes">Creating DataFrames</h2>
-
-<p>With a <code>SQLContext</code>, applications can create <code>DataFrame</code>s from an <a href="#interoperating-with-rdds">existing <code>RDD</code></a>, from a Hive table, or from <a href="#data-sources">data sources</a>.</p>
-
-<p>As an example, the following creates a <code>DataFrame</code> based on the content of a JSON file:</p>
-
-<div class="codetabs">
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span> <span class="c1">// An existing SparkContext.</span>
-<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
-
-<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">json</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="o">)</span>
-
-<span class="c1">// Displays the content of the DataFrame to stdout</span>
-<span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="o">()</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">JavaSparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="o">...;</span> <span class="c1">// An existing JavaSparkContext.</span>
-<span class="n">SQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span>
-
-<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">json</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="o">);</span>
-
-<span class="c1">// Displays the content of the DataFrame to stdout</span>
-<span class="n">df</span><span class="o">.</span><span class="na">show</span><span class="o">();</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
-<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
-
-<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">json</span><span class="p">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="p">)</span>
-
-<span class="c"># Displays the content of the DataFrame to stdout</span>
-<span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-
- <div class="highlight"><pre><code class="language-r" data-lang="r">sqlContext <span class="o">&lt;-</span> sparkRSQL.init<span class="p">(</span>sc<span class="p">)</span>
-
-df <span class="o">&lt;-</span> jsonFile<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="p">)</span>
-
-<span class="c1"># Displays the content of the DataFrame to stdout</span>
-showDF<span class="p">(</span>df<span class="p">)</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h2 id="dataframe-operations">DataFrame Operations</h2>
-
-<p>DataFrames provide a domain-specific language for structured data manipulation in <a href="api/scala/index.html#org.apache.spark.sql.DataFrame">Scala</a>, <a href="api/java/index.html?org/apache/spark/sql/DataFrame.html">Java</a>, <a href="api/python/pyspark.sql.html#pyspark.sql.DataFrame">Python</a>, and <a href="api/R/index.html">R</a>.</p>
-
-<p>Here we include some basic examples of structured data processing using DataFrames:</p>
-
-<div class="codetabs">
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">sc</span><span class="k">:</span> <span class="kt">SparkContext</span> <span class="c1">// An existing SparkContext.</span>
-<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
-
-<span class="c1">// Create the DataFrame</span>
-<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">json</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="o">)</span>
-
-<span class="c1">// Show the content of the DataFrame</span>
-<span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="o">()</span>
-<span class="c1">// age name</span>
-<span class="c1">// null Michael</span>
-<span class="c1">// 30 Andy</span>
-<span class="c1">// 19 Justin</span>
-
-<span class="c1">// Print the schema in a tree format</span>
-<span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="o">()</span>
-<span class="c1">// root</span>
-<span class="c1">// |-- age: long (nullable = true)</span>
-<span class="c1">// |-- name: string (nullable = true)</span>
-
-<span class="c1">// Select only the &quot;name&quot; column</span>
-<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">).</span><span class="n">show</span><span class="o">()</span>
-<span class="c1">// name</span>
-<span class="c1">// Michael</span>
-<span class="c1">// Andy</span>
-<span class="c1">// Justin</span>
-
-<span class="c1">// Select everybody, but increment the age by 1</span>
-<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="n">df</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">),</span> <span class="n">df</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">)</span> <span class="o">+</span> <span class="mi">1</span><span class="o">).</span><span class="n">show</span><span class="o">()</span>
-<span class="c1">// name (age + 1)</span>
-<span class="c1">// Michael null</span>
-<span class="c1">// Andy 31</span>
-<span class="c1">// Justin 20</span>
-
-<span class="c1">// Select people older than 21</span>
-<span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span><span class="n">df</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">)</span> <span class="o">&gt;</span> <span class="mi">21</span><span class="o">).</span><span class="n">show</span><span class="o">()</span>
-<span class="c1">// age name</span>
-<span class="c1">// 30 Andy</span>
-
-<span class="c1">// Count people by age</span>
-<span class="n">df</span><span class="o">.</span><span class="n">groupBy</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">).</span><span class="n">count</span><span class="o">().</span><span class="n">show</span><span class="o">()</span>
-<span class="c1">// age count</span>
-<span class="c1">// null 1</span>
-<span class="c1">// 19 1</span>
-<span class="c1">// 30 1</span></code></pre></div>
-
- <p>For a complete list of the types of operations that can be performed on a DataFrame refer to the <a href="api/scala/index.html#org.apache.spark.sql.DataFrame">API Documentation</a>.</p>
-
- <p>In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the <a href="api/scala/index.html#org.apache.spark.sql.functions$">DataFrame Function Reference</a>.</p>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">JavaSparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="o">...;</span> <span class="c1">// An existing JavaSparkContext.</span>
-<span class="n">SQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span>
-
-<span class="c1">// Create the DataFrame</span>
-<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">json</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="o">);</span>
-
-<span class="c1">// Show the content of the DataFrame</span>
-<span class="n">df</span><span class="o">.</span><span class="na">show</span><span class="o">();</span>
-<span class="c1">// age name</span>
-<span class="c1">// null Michael</span>
-<span class="c1">// 30 Andy</span>
-<span class="c1">// 19 Justin</span>
-
-<span class="c1">// Print the schema in a tree format</span>
-<span class="n">df</span><span class="o">.</span><span class="na">printSchema</span><span class="o">();</span>
-<span class="c1">// root</span>
-<span class="c1">// |-- age: long (nullable = true)</span>
-<span class="c1">// |-- name: string (nullable = true)</span>
-
-<span class="c1">// Select only the &quot;name&quot; column</span>
-<span class="n">df</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">).</span><span class="na">show</span><span class="o">();</span>
-<span class="c1">// name</span>
-<span class="c1">// Michael</span>
-<span class="c1">// Andy</span>
-<span class="c1">// Justin</span>
-
-<span class="c1">// Select everybody, but increment the age by 1</span>
-<span class="n">df</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="n">df</span><span class="o">.</span><span class="na">col</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">),</span> <span class="n">df</span><span class="o">.</span><span class="na">col</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">).</span><span class="na">plus</span><span class="o">(</span><span class="mi">1</span><span class="o">)).</span><span class="na">show</span><span class="o">();</span>
-<span class="c1">// name (age + 1)</span>
-<span class="c1">// Michael null</span>
-<span class="c1">// Andy 31</span>
-<span class="c1">// Justin 20</span>
-
-<span class="c1">// Select people older than 21</span>
-<span class="n">df</span><span class="o">.</span><span class="na">filter</span><span class="o">(</span><span class="n">df</span><span class="o">.</span><span class="na">col</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">).</span><span class="na">gt</span><span class="o">(</span><span class="mi">21</span><span class="o">)).</span><span class="na">show</span><span class="o">();</span>
-<span class="c1">// age name</span>
-<span class="c1">// 30 Andy</span>
-
-<span class="c1">// Count people by age</span>
-<span class="n">df</span><span class="o">.</span><span class="na">groupBy</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">).</span><span class="na">count</span><span class="o">().</span><span class="na">show</span><span class="o">();</span>
-<span class="c1">// age count</span>
-<span class="c1">// null 1</span>
-<span class="c1">// 19 1</span>
-<span class="c1">// 30 1</span></code></pre></div>
-
- <p>For a complete list of the types of operations that can be performed on a DataFrame refer to the <a href="api/java/org/apache/spark/sql/DataFrame.html">API Documentation</a>.</p>
-
- <p>In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the <a href="api/java/org/apache/spark/sql/functions.html">DataFrame Function Reference</a>.</p>
-
- </div>
-
-<div data-lang="python">
- <p>In Python it&#8217;s possible to access a DataFrame&#8217;s columns either by attribute
-(<code>df.age</code>) or by indexing (<code>df['age']</code>). While the former is convenient for
-interactive data exploration, users are highly encouraged to use the
-latter form, which is future-proof and won&#8217;t break with column names that
-are also attributes on the DataFrame class.</p>
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
-<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
-
-<span class="c"># Create the DataFrame</span>
-<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">json</span><span class="p">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="p">)</span>
-
-<span class="c"># Show the content of the DataFrame</span>
-<span class="n">df</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
-<span class="c">## age name</span>
-<span class="c">## null Michael</span>
-<span class="c">## 30 Andy</span>
-<span class="c">## 19 Justin</span>
-
-<span class="c"># Print the schema in a tree format</span>
-<span class="n">df</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
-<span class="c">## root</span>
-<span class="c">## |-- age: long (nullable = true)</span>
-<span class="c">## |-- name: string (nullable = true)</span>
-
-<span class="c"># Select only the &quot;name&quot; column</span>
-<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;name&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
-<span class="c">## name</span>
-<span class="c">## Michael</span>
-<span class="c">## Andy</span>
-<span class="c">## Justin</span>
-
-<span class="c"># Select everybody, but increment the age by 1</span>
-<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s">&#39;name&#39;</span><span class="p">],</span> <span class="n">df</span><span class="p">[</span><span class="s">&#39;age&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
-<span class="c">## name (age + 1)</span>
-<span class="c">## Michael null</span>
-<span class="c">## Andy 31</span>
-<span class="c">## Justin 20</span>
-
-<span class="c"># Select people older than 21</span>
-<span class="n">df</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s">&#39;age&#39;</span><span class="p">]</span> <span class="o">&gt;</span> <span class="mi">21</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
-<span class="c">## age name</span>
-<span class="c">## 30 Andy</span>
-
-<span class="c"># Count people by age</span>
-<span class="n">df</span><span class="o">.</span><span class="n">groupBy</span><span class="p">(</span><span class="s">&quot;age&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
-<span class="c">## age count</span>
-<span class="c">## null 1</span>
-<span class="c">## 19 1</span>
-<span class="c">## 30 1</span></code></pre></div>
-
- <p>For a complete list of the types of operations that can be performed on a DataFrame refer to the <a href="api/python/pyspark.sql.html#pyspark.sql.DataFrame">API Documentation</a>.</p>
-
- <p>In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the <a href="api/python/pyspark.sql.html#module-pyspark.sql.functions">DataFrame Function Reference</a>.</p>
-
- </div>
-
-<div data-lang="r">
-
- <div class="highlight"><pre><code class="language-r" data-lang="r">sqlContext <span class="o">&lt;-</span> sparkRSQL.init<span class="p">(</span>sc<span class="p">)</span>
-
-<span class="c1"># Create the DataFrame</span>
-df <span class="o">&lt;-</span> jsonFile<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="p">)</span>
-
-<span class="c1"># Show the content of the DataFrame</span>
-showDF<span class="p">(</span>df<span class="p">)</span>
-<span class="c1">## age name</span>
-<span class="c1">## null Michael</span>
-<span class="c1">## 30 Andy</span>
-<span class="c1">## 19 Justin</span>
-
-<span class="c1"># Print the schema in a tree format</span>
-printSchema<span class="p">(</span>df<span class="p">)</span>
-<span class="c1">## root</span>
-<span class="c1">## |-- age: long (nullable = true)</span>
-<span class="c1">## |-- name: string (nullable = true)</span>
-
-<span class="c1"># Select only the &quot;name&quot; column</span>
-showDF<span class="p">(</span>select<span class="p">(</span>df<span class="p">,</span> <span class="s">&quot;name&quot;</span><span class="p">))</span>
-<span class="c1">## name</span>
-<span class="c1">## Michael</span>
-<span class="c1">## Andy</span>
-<span class="c1">## Justin</span>
-
-<span class="c1"># Select everybody, but increment the age by 1</span>
-showDF<span class="p">(</span>select<span class="p">(</span>df<span class="p">,</span> df<span class="o">$</span>name<span class="p">,</span> df<span class="o">$</span>age <span class="o">+</span> <span class="m">1</span><span class="p">))</span>
-<span class="c1">## name (age + 1)</span>
-<span class="c1">## Michael null</span>
-<span class="c1">## Andy 31</span>
-<span class="c1">## Justin 20</span>
-
-<span class="c1"># Select people older than 21</span>
-showDF<span class="p">(</span>where<span class="p">(</span>df<span class="p">,</span> df<span class="o">$</span>age <span class="o">&gt;</span> <span class="m">21</span><span class="p">))</span>
-<span class="c1">## age name</span>
-<span class="c1">## 30 Andy</span>
-
-<span class="c1"># Count people by age</span>
-showDF<span class="p">(</span>count<span class="p">(</span>groupBy<span class="p">(</span>df<span class="p">,</span> <span class="s">&quot;age&quot;</span><span class="p">)))</span>
-<span class="c1">## age count</span>
-<span class="c1">## null 1</span>
-<span class="c1">## 19 1</span>
-<span class="c1">## 30 1</span></code></pre></div>
-
- <p>For a complete list of the types of operations that can be performed on a DataFrame refer to the <a href="api/R/index.html">API Documentation</a>.</p>
-
- <p>In addition to simple column references and expressions, DataFrames also have a rich library of functions including string manipulation, date arithmetic, common math operations and more. The complete list is available in the <a href="api/R/index.html">DataFrame Function Reference</a>.</p>
-
- </div>
-
-</div>
-
-<h2 id="running-sql-queries-programmatically">Running SQL Queries Programmatically</h2>
-
-<p>The <code>sql</code> function on a <code>SQLContext</code> enables applications to run SQL queries programmatically and returns the result as a <code>DataFrame</code>.</p>
-
-<div class="codetabs">
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="o">...</span> <span class="c1">// An existing SQLContext</span>
-<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;SELECT * FROM table&quot;</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">SQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="o">...</span> <span class="c1">// An existing SQLContext</span>
-<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;SELECT * FROM table&quot;</span><span class="o">);</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
-<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
-<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;SELECT * FROM table&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-
- <div class="highlight"><pre><code class="language-r" data-lang="r">sqlContext <span class="o">&lt;-</span> sparkRSQL.init<span class="p">(</span>sc<span class="p">)</span>
-df <span class="o">&lt;-</span> sql<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;SELECT * FROM table&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h2 id="interoperating-with-rdds">Interoperating with RDDs</h2>
-
-<p>Spark SQL supports two different methods for converting existing RDDs into DataFrames. The first
-method uses reflection to infer the schema of an RDD that contains specific types of objects. This
-reflection based approach leads to more concise code and works well when you already know the schema
-while writing your Spark application.</p>
-
-<p>The second method for creating DataFrames is through a programmatic interface that allows you to
-construct a schema and then apply it to an existing RDD. While this method is more verbose, it allows
-you to construct DataFrames when the columns and their types are not known until runtime.</p>
-
-<h3 id="inferring-the-schema-using-reflection">Inferring the Schema Using Reflection</h3>
-<div class="codetabs">
-
-<div data-lang="scala">
-
- <p>The Scala interface for Spark SQL supports automatically converting an RDD containing case classes
-to a DataFrame. The case class
-defines the schema of the table. The names of the arguments to the case class are read using
-reflection and become the names of the columns. Case classes can also be nested or contain complex
-types such as Sequences or Arrays. This RDD can be implicitly converted to a DataFrame and then be
-registered as a table. Tables can be used in subsequent SQL statements.</p>
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// sc is an existing SparkContext.</span>
-<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
-<span class="c1">// this is used to implicitly convert an RDD to a DataFrame.</span>
-<span class="k">import</span> <span class="nn">sqlContext.implicits._</span>
-
-<span class="c1">// Define the schema using a case class.</span>
-<span class="c1">// Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit,</span>
-<span class="c1">// you can use custom classes that implement the Product interface.</span>
-<span class="k">case</span> <span class="k">class</span> <span class="nc">Person</span><span class="o">(</span><span class="n">name</span><span class="k">:</span> <span class="kt">String</span><span class="o">,</span> <span class="n">age</span><span class="k">:</span> <span class="kt">Int</span><span class="o">)</span>
-
-<span class="c1">// Create an RDD of Person objects and register it as a table.</span>
-<span class="k">val</span> <span class="n">people</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot;,&quot;</span><span class="o">)).</span><span class="n">map</span><span class="o">(</span><span class="n">p</span> <span class="k">=&gt;</span> <span class="nc">Person</span><span class="o">(</span><span class="n">p</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span> <span class="n">p</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="n">trim</span><span class="o">.</span><span class="n">toInt</span><span class="o">)).</span><span class="n">toDF</span><span class="o">()</span>
-<span class="n">people</span><span class="o">.</span><span class="n">registerTempTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">)</span>
-
-<span class="c1">// SQL statements can be run by using the sql methods provided by sqlContext.</span>
-<span class="k">val</span> <span class="n">teenagers</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;SELECT name, age FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">)</span>
-
-<span class="c1">// The results of SQL queries are DataFrames and support all the normal RDD operations.</span>
-<span class="c1">// The columns of a row in the result can be accessed by field index:</span>
-<span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">t</span> <span class="k">=&gt;</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">t</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span>
-
-<span class="c1">// or by field name:</span>
-<span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">t</span> <span class="k">=&gt;</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">t</span><span class="o">.</span><span class="n">getAs</span><span class="o">[</span><span class="kt">String</span><span class="o">](</span><span class="s">&quot;name&quot;</span><span class="o">)).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span>
-
-<span class="c1">// row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]</span>
-<span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">getValuesMap</span><span class="o">[</span><span class="kt">Any</span><span class="o">](</span><span class="nc">List</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">,</span> <span class="s">&quot;age&quot;</span><span class="o">))).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span>
-<span class="c1">// Map(&quot;name&quot; -&gt; &quot;Justin&quot;, &quot;age&quot; -&gt; 19)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <p>Spark SQL supports automatically converting an RDD of <a href="http://stackoverflow.com/questions/3295496/what-is-a-javabean-exactly">JavaBeans</a>
-into a DataFrame. The BeanInfo, obtained using reflection, defines the schema of the table.
-Currently, Spark SQL does not support JavaBeans that contain
-nested properties or complex types such as Lists or Arrays. You can create a JavaBean by creating a
-class that implements Serializable and has getters and setters for all of its fields.</p>
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">Person</span> <span class="kd">implements</span> <span class="n">Serializable</span> <span class="o">{</span>
- <span class="kd">private</span> <span class="n">String</span> <span class="n">name</span><span class="o">;</span>
- <span class="kd">private</span> <span class="kt">int</span> <span class="n">age</span><span class="o">;</span>
-
- <span class="kd">public</span> <span class="n">String</span> <span class="nf">getName</span><span class="o">()</span> <span class="o">{</span>
- <span class="k">return</span> <span class="n">name</span><span class="o">;</span>
- <span class="o">}</span>
-
- <span class="kd">public</span> <span class="kt">void</span> <span class="nf">setName</span><span class="o">(</span><span class="n">String</span> <span class="n">name</span><span class="o">)</span> <span class="o">{</span>
- <span class="k">this</span><span class="o">.</span><span class="na">name</span> <span class="o">=</span> <span class="n">name</span><span class="o">;</span>
- <span class="o">}</span>
-
- <span class="kd">public</span> <span class="kt">int</span> <span class="nf">getAge</span><span class="o">()</span> <span class="o">{</span>
- <span class="k">return</span> <span class="n">age</span><span class="o">;</span>
- <span class="o">}</span>
-
- <span class="kd">public</span> <span class="kt">void</span> <span class="nf">setAge</span><span class="o">(</span><span class="kt">int</span> <span class="n">age</span><span class="o">)</span> <span class="o">{</span>
- <span class="k">this</span><span class="o">.</span><span class="na">age</span> <span class="o">=</span> <span class="n">age</span><span class="o">;</span>
- <span class="o">}</span>
-<span class="o">}</span></code></pre></div>
-
- <p>A schema can be applied to an existing RDD by calling <code>createDataFrame</code> and providing the Class object
-for the JavaBean.</p>
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// sc is an existing JavaSparkContext.</span>
-<span class="n">SQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span>
-
-<span class="c1">// Load a text file and convert each line to a JavaBean.</span>
-<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Person</span><span class="o">&gt;</span> <span class="n">people</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="o">).</span><span class="na">map</span><span class="o">(</span>
- <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Person</span><span class="o">&gt;()</span> <span class="o">{</span>
- <span class="kd">public</span> <span class="n">Person</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">line</span><span class="o">)</span> <span class="kd">throws</span> <span class="n">Exception</span> <span class="o">{</span>
- <span class="n">String</span><span class="o">[]</span> <span class="n">parts</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot;,&quot;</span><span class="o">);</span>
-
- <span class="n">Person</span> <span class="n">person</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Person</span><span class="o">();</span>
- <span class="n">person</span><span class="o">.</span><span class="na">setName</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">0</span><span class="o">]);</span>
- <span class="n">person</span><span class="o">.</span><span class="na">setAge</span><span class="o">(</span><span class="n">Integer</span><span class="o">.</span><span class="na">parseInt</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">1</span><span class="o">].</span><span class="na">trim</span><span class="o">()));</span>
-
- <span class="k">return</span> <span class="n">person</span><span class="o">;</span>
- <span class="o">}</span>
- <span class="o">});</span>
-
-<span class="c1">// Apply a schema to an RDD of JavaBeans and register it as a table.</span>
-<span class="n">DataFrame</span> <span class="n">schemaPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">people</span><span class="o">,</span> <span class="n">Person</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
-<span class="n">schemaPeople</span><span class="o">.</span><span class="na">registerTempTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">);</span>
-
-<span class="c1">// SQL can be run over RDDs that have been registered as tables.</span>
-<span class="n">DataFrame</span> <span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">);</span>
-
-<span class="c1">// The results of SQL queries are DataFrames and support all the normal RDD operations.</span>
-<span class="c1">// The columns of a row in the result can be accessed by ordinal.</span>
-<span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">teenagerNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="o">.</span><span class="na">javaRDD</span><span class="o">().</span><span class="na">map</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
- <span class="kd">public</span> <span class="n">String</span> <span class="nf">call</span><span class="o">(</span><span class="n">Row</span> <span class="n">row</span><span class="o">)</span> <span class="o">{</span>
- <span class="k">return</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">getString</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
- <span class="o">}</span>
-<span class="o">}).</span><span class="na">collect</span><span class="o">();</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <p>Spark SQL can convert an RDD of Row objects to a DataFrame, inferring the datatypes. Rows are constructed by passing
-key/value pairs as kwargs to the Row class. The keys of these kwargs define the column names of the table,
-and the types are inferred by looking at the first row. Since we currently only look at the first
-row, it is important that there is no missing data in the first row of the RDD. In future versions we
-plan to more completely infer the schema by looking at more data, similar to the inference that is
-performed on JSON files.</p>
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># sc is an existing SparkContext.</span>
-<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span><span class="p">,</span> <span class="n">Row</span>
-<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
-
-<span class="c"># Load a text file and convert each line to a Row.</span>
-<span class="n">lines</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="p">)</span>
-<span class="n">parts</span> <span class="o">=</span> <span class="n">lines</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">l</span><span class="p">:</span> <span class="n">l</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&quot;,&quot;</span><span class="p">))</span>
-<span class="n">people</span> <span class="o">=</span> <span class="n">parts</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="n">Row</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">age</span><span class="o">=</span><span class="nb">int</span><span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">1</span><span class="p">])))</span>
-
-<span class="c"># Infer the schema, and register the DataFrame as a table.</span>
-<span class="n">schemaPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">people</span><span class="p">)</span>
-<span class="n">schemaPeople</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s">&quot;people&quot;</span><span class="p">)</span>
-
-<span class="c"># SQL can be run over DataFrames that have been registered as a table.</span>
-<span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span>
-
-<span class="c"># The results of SQL queries are RDDs and support all the normal RDD operations.</span>
-<span class="n">teenNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">p</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
-<span class="k">for</span> <span class="n">teenName</span> <span class="ow">in</span> <span class="n">teenNames</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span>
- <span class="k">print</span><span class="p">(</span><span class="n">teenName</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h3 id="programmatically-specifying-the-schema">Programmatically Specifying the Schema</h3>
-
-<div class="codetabs">
-
-<div data-lang="scala">
-
- <p>When case classes cannot be defined ahead of time (for example,
-the structure of records is encoded in a string, or a text dataset will be parsed
-and fields will be projected differently for different users),
-a <code>DataFrame</code> can be created programmatically with three steps.</p>
-
- <ol>
- <li>Create an RDD of <code>Row</code>s from the original RDD;</li>
- <li>Create the schema represented by a <code>StructType</code> matching the structure of
-<code>Row</code>s in the RDD created in Step 1.</li>
- <li>Apply the schema to the RDD of <code>Row</code>s via <code>createDataFrame</code> method provided
-by <code>SQLContext</code>.</li>
- </ol>
-
- <p>For example:</p>
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// sc is an existing SparkContext.</span>
-<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
-
-<span class="c1">// Create an RDD</span>
-<span class="k">val</span> <span class="n">people</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="o">)</span>
-
-<span class="c1">// The schema is encoded in a string</span>
-<span class="k">val</span> <span class="n">schemaString</span> <span class="k">=</span> <span class="s">&quot;name age&quot;</span>
-
-<span class="c1">// Import Row.</span>
-<span class="k">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
-
-<span class="c1">// Import Spark SQL data types</span>
-<span class="k">import</span> <span class="nn">org.apache.spark.sql.types.</span><span class="o">{</span><span class="nc">StructType</span><span class="o">,</span><span class="nc">StructField</span><span class="o">,</span><span class="nc">StringType</span><span class="o">};</span>
-
-<span class="c1">// Generate the schema based on the string of schema</span>
-<span class="k">val</span> <span class="n">schema</span> <span class="k">=</span>
- <span class="nc">StructType</span><span class="o">(</span>
- <span class="n">schemaString</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">fieldName</span> <span class="k">=&gt;</span> <span class="nc">StructField</span><span class="o">(</span><span class="n">fieldName</span><span class="o">,</span> <span class="nc">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">)))</span>
-
-<span class="c1">// Convert records of the RDD (people) to Rows.</span>
-<span class="k">val</span> <span class="n">rowRDD</span> <span class="k">=</span> <span class="n">people</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot;,&quot;</span><span class="o">)).</span><span class="n">map</span><span class="o">(</span><span class="n">p</span> <span class="k">=&gt;</span> <span class="nc">Row</span><span class="o">(</span><span class="n">p</span><span class="o">(</span><span class="mi">0</span><span class="o">),</span> <span class="n">p</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="n">trim</span><span class="o">))</span>
-
-<span class="c1">// Apply the schema to the RDD.</span>
-<span class="k">val</span> <span class="n">peopleDataFrame</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">rowRDD</span><span class="o">,</span> <span class="n">schema</span><span class="o">)</span>
-
-<span class="c1">// Register the DataFrames as a table.</span>
-<span class="n">peopleDataFrame</span><span class="o">.</span><span class="n">registerTempTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">)</span>
-
-<span class="c1">// SQL statements can be run by using the sql methods provided by sqlContext.</span>
-<span class="k">val</span> <span class="n">results</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people&quot;</span><span class="o">)</span>
-
-<span class="c1">// The results of SQL queries are DataFrames and support all the normal RDD operations.</span>
-<span class="c1">// The columns of a row in the result can be accessed by field index or by field name.</span>
-<span class="n">results</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">t</span> <span class="k">=&gt;</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">t</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <p>When JavaBean classes cannot be defined ahead of time (for example,
-the structure of records is encoded in a string, or a text dataset will be parsed and
-fields will be projected differently for different users),
-a <code>DataFrame</code> can be created programmatically with three steps.</p>
-
- <ol>
- <li>Create an RDD of <code>Row</code>s from the original RDD;</li>
- <li>Create the schema represented by a <code>StructType</code> matching the structure of
-<code>Row</code>s in the RDD created in Step 1.</li>
- <li>Apply the schema to the RDD of <code>Row</code>s via <code>createDataFrame</code> method provided
-by <code>SQLContext</code>.</li>
- </ol>
-
- <p>For example:</p>
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span>
-<span class="c1">// Import factory methods provided by DataTypes.</span>
-<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
-<span class="c1">// Import StructType and StructField</span>
-<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
-<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
-<span class="c1">// Import Row.</span>
-<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
-<span class="c1">// Import RowFactory.</span>
-<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
-
-<span class="c1">// sc is an existing JavaSparkContext.</span>
-<span class="n">SQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span>
-
-<span class="c1">// Load a text file and convert each line to a JavaBean.</span>
-<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">people</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="o">);</span>
-
-<span class="c1">// The schema is encoded in a string</span>
-<span class="n">String</span> <span class="n">schemaString</span> <span class="o">=</span> <span class="s">&quot;name age&quot;</span><span class="o">;</span>
-
-<span class="c1">// Generate the schema based on the string of schema</span>
-<span class="n">List</span><span class="o">&lt;</span><span class="n">StructField</span><span class="o">&gt;</span> <span class="n">fields</span> <span class="o">=</span> <span class="k">new</span> <span class="n">ArrayList</span><span class="o">&lt;</span><span class="n">StructField</span><span class="o">&gt;();</span>
-<span class="k">for</span> <span class="o">(</span><span class="n">String</span> <span class="nl">fieldName:</span> <span class="n">schemaString</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">))</span> <span class="o">{</span>
- <span class="n">fields</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructField</span><span class="o">(</span><span class="n">fieldName</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">));</span>
-<span class="o">}</span>
-<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructType</span><span class="o">(</span><span class="n">fields</span><span class="o">);</span>
-
-<span class="c1">// Convert records of the RDD (people) to Rows.</span>
-<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">rowRDD</span> <span class="o">=</span> <span class="n">people</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
- <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Row</span><span class="o">&gt;()</span> <span class="o">{</span>
- <span class="kd">public</span> <span class="n">Row</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">record</span><span class="o">)</span> <span class="kd">throws</span> <span class="n">Exception</span> <span class="o">{</span>
- <span class="n">String</span><span class="o">[]</span> <span class="n">fields</span> <span class="o">=</span> <span class="n">record</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot;,&quot;</span><span class="o">);</span>
- <span class="k">return</span> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">fields</span><span class="o">[</span><span class="mi">0</span><span class="o">],</span> <span class="n">fields</span><span class="o">[</span><span class="mi">1</span><span class="o">].</span><span class="na">trim</span><span class="o">());</span>
- <span class="o">}</span>
- <span class="o">});</span>
-
-<span class="c1">// Apply the schema to the RDD.</span>
-<span class="n">DataFrame</span> <span class="n">peopleDataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">rowRDD</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
-
-<span class="c1">// Register the DataFrame as a table.</span>
-<span class="n">peopleDataFrame</span><span class="o">.</span><span class="na">registerTempTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">);</span>
-
-<span class="c1">// SQL can be run over RDDs that have been registered as tables.</span>
-<span class="n">DataFrame</span> <span class="n">results</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people&quot;</span><span class="o">);</span>
-
-<span class="c1">// The results of SQL queries are DataFrames and support all the normal RDD operations.</span>
-<span class="c1">// The columns of a row in the result can be accessed by ordinal.</span>
-<span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">names</span> <span class="o">=</span> <span class="n">results</span><span class="o">.</span><span class="na">javaRDD</span><span class="o">().</span><span class="na">map</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
- <span class="kd">public</span> <span class="n">String</span> <span class="nf">call</span><span class="o">(</span><span class="n">Row</span> <span class="n">row</span><span class="o">)</span> <span class="o">{</span>
- <span class="k">return</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">getString</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
- <span class="o">}</span>
-<span class="o">}).</span><span class="na">collect</span><span class="o">();</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <p>When a dictionary of kwargs cannot be defined ahead of time (for example,
-the structure of records is encoded in a string, or a text dataset will be parsed and
-fields will be projected differently for different users),
-a <code>DataFrame</code> can be created programmatically with three steps.</p>
-
- <ol>
- <li>Create an RDD of tuples or lists from the original RDD;</li>
- <li>Create the schema represented by a <code>StructType</code> matching the structure of
-tuples or lists in the RDD created in the step 1.</li>
- <li>Apply the schema to the RDD via <code>createDataFrame</code> method provided by <code>SQLContext</code>.</li>
- </ol>
-
- <p>For example:</p>
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># Import SQLContext and data types</span>
-<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
-<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span>
-
-<span class="c"># sc is an existing SparkContext.</span>
-<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
-
-<span class="c"># Load a text file and convert each line to a tuple.</span>
-<span class="n">lines</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">&quot;examples/src/main/resources/people.txt&quot;</span><span class="p">)</span>
-<span class="n">parts</span> <span class="o">=</span> <span class="n">lines</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">l</span><span class="p">:</span> <span class="n">l</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&quot;,&quot;</span><span class="p">))</span>
-<span class="n">people</span> <span class="o">=</span> <span class="n">parts</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">p</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">strip</span><span class="p">()))</span>
-
-<span class="c"># The schema is encoded in a string.</span>
-<span class="n">schemaString</span> <span class="o">=</span> <span class="s">&quot;name age&quot;</span>
-
-<span class="n">fields</span> <span class="o">=</span> <span class="p">[</span><span class="n">StructField</span><span class="p">(</span><span class="n">field_name</span><span class="p">,</span> <span class="n">StringType</span><span class="p">(),</span> <span class="bp">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field_name</span> <span class="ow">in</span> <span class="n">schemaString</span><span class="o">.</span><span class="n">split</span><span class="p">()]</span>
-<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">(</span><span class="n">fields</span><span class="p">)</span>
-
-<span class="c"># Apply the schema to the RDD.</span>
-<span class="n">schemaPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">people</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
-
-<span class="c"># Register the DataFrame as a table.</span>
-<span class="n">schemaPeople</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s">&quot;people&quot;</span><span class="p">)</span>
-
-<span class="c"># SQL can be run over DataFrames that have been registered as a table.</span>
-<span class="n">results</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;SELECT name FROM people&quot;</span><span class="p">)</span>
-
-<span class="c"># The results of SQL queries are RDDs and support all the normal RDD operations.</span>
-<span class="n">names</span> <span class="o">=</span> <span class="n">results</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">p</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
-<span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">names</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span>
- <span class="k">print</span><span class="p">(</span><span class="n">name</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h1 id="data-sources">Data Sources</h1>
-
-<p>Spark SQL supports operating on a variety of data sources through the <code>DataFrame</code> interface.
-A DataFrame can be operated on as a normal RDD and can also be registered as a temporary table.
-Registering a DataFrame as a table allows you to run SQL queries over its data. This section
-describes the general methods for loading and saving data using the Spark Data Sources and then
-goes into specific options that are available for the built-in data sources.</p>
-
-<h2 id="generic-loadsave-functions">Generic Load/Save Functions</h2>
-
-<p>In the simplest form, the default data source (<code>parquet</code> unless otherwise configured by
-<code>spark.sql.sources.default</code>) will be used for all operations.</p>
-
-<div class="codetabs">
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">load</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/users.parquet&quot;</span><span class="o">)</span>
-<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">,</span> <span class="s">&quot;favorite_color&quot;</span><span class="o">).</span><span class="n">write</span><span class="o">.</span><span class="n">save</span><span class="o">(</span><span class="s">&quot;namesAndFavColors.parquet&quot;</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">load</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/users.parquet&quot;</span><span class="o">);</span>
-<span class="n">df</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">,</span> <span class="s">&quot;favorite_color&quot;</span><span class="o">).</span><span class="na">write</span><span class="o">().</span><span class="na">save</span><span class="o">(</span><span class="s">&quot;namesAndFavColors.parquet&quot;</span><span class="o">);</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">&quot;examples/src/main/resources/users.parquet&quot;</span><span class="p">)</span>
-<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;name&quot;</span><span class="p">,</span> <span class="s">&quot;favorite_color&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s">&quot;namesAndFavColors.parquet&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-
- <div class="highlight"><pre><code class="language-r" data-lang="r">df <span class="o">&lt;-</span> loadDF<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;people.parquet&quot;</span><span class="p">)</span>
-saveDF<span class="p">(</span>select<span class="p">(</span>df<span class="p">,</span> <span class="s">&quot;name&quot;</span><span class="p">,</span> <span class="s">&quot;age&quot;</span><span class="p">),</span> <span class="s">&quot;namesAndAges.parquet&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-</div>
-
-<h3 id="manually-specifying-options">Manually Specifying Options</h3>
-
-<p>You can also manually specify the data source that will be used along with any extra options
-that you would like to pass to the data source. Data sources are specified by their fully qualified
-name (e.g., <code>org.apache.spark.sql.parquet</code>), but for built-in sources you can also use their short
-names (<code>json</code>, <code>parquet</code>, <code>jdbc</code>). Using this syntax, a DataFrame loaded from any data source type
-can be written out in any other format.</p>
-
-<div class="codetabs">
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">&quot;json&quot;</span><span class="o">).</span><span class="n">load</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="o">)</span>
-<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">,</span> <span class="s">&quot;age&quot;</span><span class="o">).</span><span class="n">write</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">&quot;parquet&quot;</span><span class="o">).</span><span class="n">save</span><span class="o">(</span><span class="s">&quot;namesAndAges.parquet&quot;</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;json&quot;</span><span class="o">).</span><span class="na">load</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="o">);</span>
-<span class="n">df</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;name&quot;</span><span class="o">,</span> <span class="s">&quot;age&quot;</span><span class="o">).</span><span class="na">write</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;parquet&quot;</span><span class="o">).</span><span class="na">save</span><span class="o">(</span><span class="s">&quot;namesAndAges.parquet&quot;</span><span class="o">);</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s">&quot;json&quot;</span><span class="p">)</span>
-<span class="n">df</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;name&quot;</span><span class="p">,</span> <span class="s">&quot;age&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s">&quot;namesAndAges.parquet&quot;</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s">&quot;parquet&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-<div data-lang="r">
-
- <div class="highlight"><pre><code class="language-r" data-lang="r">df <span class="o">&lt;-</span> loadDF<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;people.json&quot;</span><span class="p">,</span> <span class="s">&quot;json&quot;</span><span class="p">)</span>
-saveDF<span class="p">(</span>select<span class="p">(</span>df<span class="p">,</span> <span class="s">&quot;name&quot;</span><span class="p">,</span> <span class="s">&quot;age&quot;</span><span class="p">),</span> <span class="s">&quot;namesAndAges.parquet&quot;</span><span class="p">,</span> <span class="s">&quot;parquet&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-</div>
-
-<h3 id="save-modes">Save Modes</h3>
-
-<p>Save operations can optionally take a <code>SaveMode</code>, which specifies how to handle existing data if
-present. It is important to realize that these save modes do not use any locking and are not
-atomic. Additionally, when performing an <code>Overwrite</code>, the existing data is deleted before the new
-data is written out. A short usage sketch follows the table below.</p>
-
-<table class="table">
-<tr><th>Scala/Java</th><th>Any Language</th><th>Meaning</th></tr>
-<tr>
- <td><code>SaveMode.ErrorIfExists</code> (default)</td>
- <td><code>"error"</code> (default)</td>
- <td>
- When saving a DataFrame to a data source, if data already exists,
- an exception is expected to be thrown.
- </td>
-</tr>
-<tr>
- <td><code>SaveMode.Append</code></td>
- <td><code>"append"</code></td>
- <td>
- When saving a DataFrame to a data source, if data/table already exists,
- contents of the DataFrame are expected to be appended to existing data.
- </td>
-</tr>
-<tr>
- <td><code>SaveMode.Overwrite</code></td>
- <td><code>"overwrite"</code></td>
- <td>
- Overwrite mode means that when saving a DataFrame to a data source,
- if data/table already exists, existing data is expected to be overwritten by the contents of
- the DataFrame.
- </td>
-</tr>
-<tr>
- <td><code>SaveMode.Ignore</code></td>
- <td><code>"ignore"</code></td>
- <td>
- Ignore mode means that when saving a DataFrame to a data source, if data already exists,
- the save operation is expected to not save the contents of the DataFrame and to not
- change the existing data. This is similar to a <code>CREATE TABLE IF NOT EXISTS</code> in SQL.
- </td>
-</tr>
-</table>
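-
-<p>As a minimal sketch (assuming the <code>df</code> DataFrame and <code>sqlContext</code> from the earlier examples),
-a save mode can be passed to <code>DataFrameWriter.mode</code> either as a <code>SaveMode</code> constant or as its
-lower-case string name:</p>
-
-<div class="highlight"><pre><code class="language-scala" data-lang="scala">import org.apache.spark.sql.SaveMode
-
-// Append to any existing data at the target path.
-df.select(&quot;name&quot;, &quot;favorite_color&quot;).write.mode(SaveMode.Append).save(&quot;namesAndFavColors.parquet&quot;)
-
-// Equivalent call using the string form of the mode.
-df.select(&quot;name&quot;, &quot;favorite_color&quot;).write.mode(&quot;append&quot;).save(&quot;namesAndFavColors.parquet&quot;)</code></pre></div>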
-
-<h3 id="saving-to-persistent-tables">Saving to Persistent Tables</h3>
-
-<p>When working with a <code>HiveContext</code>, <code>DataFrames</code> can also be saved as persistent tables using the
-<code>saveAsTable</code> command. Unlike the <code>registerTempTable</code> command, <code>saveAsTable</code> will materialize the
-contents of the DataFrame and create a pointer to the data in the Hive metastore. Persistent tables
-will still exist even after your Spark program has restarted, as long as you maintain your connection
-to the same metastore. A DataFrame for a persistent table can be created by calling the <code>table</code>
-method on a <code>SQLContext</code> with the name of the table.</p>
-
-<p>By default <code>saveAsTable</code> will create a &#8220;managed table&#8221;, meaning that the location of the data will
-be controlled by the metastore. Managed tables will also have their data deleted automatically
-when a table is dropped.</p>
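-
-<p>For illustration, a minimal sketch (assuming <code>sqlContext</code> is an existing <code>HiveContext</code>, <code>df</code> is an
-existing DataFrame, and <code>people_table</code> is a hypothetical table name) might look like:</p>
-
-<div class="highlight"><pre><code class="language-scala" data-lang="scala">// Materialize the DataFrame as a managed table in the Hive metastore.
-df.write.saveAsTable(&quot;people_table&quot;)
-
-// Later, possibly in a new Spark program connected to the same metastore,
-// re-create a DataFrame for the persistent table by name.
-val peopleTable = sqlContext.table(&quot;people_table&quot;)</code></pre></div>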
-
-<h2 id="parquet-files">Parquet Files</h2>
-
-<p><a href="http://parquet.io">Parquet</a> is a columnar format that is supported by many other data processing systems.
-Spark SQL provides support for both reading and writing Parquet files that automatically preserves the schema
-of the original data.</p>
-
-<h3 id="loading-data-programmatically">Loading Data Programmatically</h3>
-
-<p>Using the data from the above example:</p>
-
-<div class="codetabs">
-
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// sqlContext from the previous example is used in this example.</span>
-<span class="c1">// This is used to implicitly convert an RDD to a DataFrame.</span>
-<span class="k">import</span> <span class="nn">sqlContext.implicits._</span>
-
-<span class="k">val</span> <span class="n">people</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[</span><span class="kt">Person</span><span class="o">]</span> <span class="k">=</span> <span class="o">...</span> <span class="c1">// An RDD of case class objects, from the previous example.</span>
-
-<span class="c1">// The RDD is implicitly converted to a DataFrame by implicits, allowing it to be stored using Parquet.</span>
-<span class="n">people</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="o">(</span><span class="s">&quot;people.parquet&quot;</span><span class="o">)</span>
-
-<span class="c1">// Read in the parquet file created above. Parquet files are self-describing so the schema is preserved.</span>
-<span class="c1">// The result of loading a Parquet file is also a DataFrame.</span>
-<span class="k">val</span> <span class="n">parquetFile</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="o">(</span><span class="s">&quot;people.parquet&quot;</span><span class="o">)</span>
-
-<span class="c1">//Parquet files can also be registered as tables and then used in SQL statements.</span>
-<span class="n">parquetFile</span><span class="o">.</span><span class="n">registerTempTable</span><span class="o">(</span><span class="s">&quot;parquetFile&quot;</span><span class="o">)</span>
-<span class="k">val</span> <span class="n">teenagers</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">)</span>
-<span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">t</span> <span class="k">=&gt;</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">t</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// sqlContext from the previous example is used in this example.</span>
-
-<span class="n">DataFrame</span> <span class="n">schemaPeople</span> <span class="o">=</span> <span class="o">...</span> <span class="c1">// The DataFrame from the previous example.</span>
-
-<span class="c1">// DataFrames can be saved as Parquet files, maintaining the schema information.</span>
-<span class="n">schemaPeople</span><span class="o">.</span><span class="na">write</span><span class="o">().</span><span class="na">parquet</span><span class="o">(</span><span class="s">&quot;people.parquet&quot;</span><span class="o">);</span>
-
-<span class="c1">// Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.</span>
-<span class="c1">// The result of loading a parquet file is also a DataFrame.</span>
-<span class="n">DataFrame</span> <span class="n">parquetFile</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">parquet</span><span class="o">(</span><span class="s">&quot;people.parquet&quot;</span><span class="o">);</span>
-
-<span class="c1">// Parquet files can also be registered as tables and then used in SQL statements.</span>
-<span class="n">parquetFile</span><span class="o">.</span><span class="na">registerTempTable</span><span class="o">(</span><span class="s">&quot;parquetFile&quot;</span><span class="o">);</span>
-<span class="n">DataFrame</span> <span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">);</span>
-<span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">teenagerNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="o">.</span><span class="na">javaRDD</span><span class="o">().</span><span class="na">map</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
- <span class="kd">public</span> <span class="n">String</span> <span class="nf">call</span><span class="o">(</span><span class="n">Row</span> <span class="n">row</span><span class="o">)</span> <span class="o">{</span>
- <span class="k">return</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">getString</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
- <span class="o">}</span>
-<span class="o">}).</span><span class="na">collect</span><span class="o">();</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># sqlContext from the previous example is used in this example.</span>
-
-<span class="n">schemaPeople</span> <span class="c"># The DataFrame from the previous example.</span>
-
-<span class="c"># DataFrames can be saved as Parquet files, maintaining the schema information.</span>
-<span class="n">schemaPeople</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">&quot;people.parquet&quot;</span><span class="p">)</span>
-
-<span class="c"># Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.</span>
-<span class="c"># The result of loading a parquet file is also a DataFrame.</span>
-<span class="n">parquetFile</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">&quot;people.parquet&quot;</span><span class="p">)</span>
-
-<span class="c"># Parquet files can also be registered as tables and then used in SQL statements.</span>
-<span class="n">parquetFile</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s">&quot;parquetFile&quot;</span><span class="p">);</span>
-<span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span>
-<span class="n">teenNames</span> <span class="o">=</span> <span class="n">teenagers</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="s">&quot;Name: &quot;</span> <span class="o">+</span> <span class="n">p</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
-<span class="k">for</span> <span class="n">teenName</span> <span class="ow">in</span> <span class="n">teenNames</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span>
- <span class="k">print</span><span class="p">(</span><span class="n">teenName</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-
- <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># sqlContext from the previous example is used in this example.</span>
-
-schemaPeople <span class="c1"># The DataFrame from the previous example.</span>
-
-<span class="c1"># DataFrames can be saved as Parquet files, maintaining the schema information.</span>
-saveAsParquetFile<span class="p">(</span>schemaPeople<span class="p">,</span> <span class="s">&quot;people.parquet&quot;</span><span class="p">)</span>
-
-<span class="c1"># Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.</span>
-<span class="c1"># The result of loading a parquet file is also a DataFrame.</span>
-parquetFile <span class="o">&lt;-</span> parquetFile<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;people.parquet&quot;</span><span class="p">)</span>
-
-<span class="c1"># Parquet files can also be registered as tables and then used in SQL statements.</span>
-registerTempTable<span class="p">(</span>parquetFile<span class="p">,</span> <span class="s">&quot;parquetFile&quot;</span><span class="p">);</span>
-teenagers <span class="o">&lt;-</span> sql<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;SELECT name FROM parquetFile WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span>
-teenNames <span class="o">&lt;-</span> map<span class="p">(</span>teenagers<span class="p">,</span> <span class="kr">function</span><span class="p">(</span>p<span class="p">)</span> <span class="p">{</span> <span class="kp">paste</span><span class="p">(</span><span class="s">&quot;Name:&quot;</span><span class="p">,</span> p<span class="o">$</span>name<span class="p">)})</span>
-<span class="kr">for</span> <span class="p">(</span>teenName <span class="kr">in</span> collect<span class="p">(</span>teenNames<span class="p">))</span> <span class="p">{</span>
- <span class="kp">cat</span><span class="p">(</span>teenName<span class="p">,</span> <span class="s">&quot;\n&quot;</span><span class="p">)</span>
-<span class="p">}</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># sqlContext is an existing HiveContext</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;REFRESH TABLE my_table&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="sql">
-
- <div class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TEMPORARY</span> <span class="k">TABLE</span> <span class="n">parquetTable</span>
-<span class="k">USING</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">spark</span><span class="p">.</span><span class="k">sql</span><span class="p">.</span><span class="n">parquet</span>
-<span class="k">OPTIONS</span> <span class="p">(</span>
- <span class="n">path</span> <span class="ss">&quot;examples/src/main/resources/people.parquet&quot;</span>
-<span class="p">)</span>
-
-<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">parquetTable</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h3 id="partition-discovery">Partition Discovery</h3>
-
-<p>Table partitioning is a common optimization approach used in systems like Hive. In a partitioned
-table, data are usually stored in different directories, with partitioning column values encoded in
-the path of each partition directory. The Parquet data source is now able to discover and infer
-partitioning information automatically. For example, we can store all our previously used
-population data into a partitioned table using the following directory structure, with two extra
-columns, <code>gender</code> and <code>country</code>, as partitioning columns:</p>
-
-<div class="highlight"><pre><code class="language-text" data-lang="text">path
-└── to
- └── table
- ├── gender=male
- │   ├── ...
- │   │
- │   ├── country=US
- │   │   └── data.parquet
- │   ├── country=CN
- │   │   └── data.parquet
- │   └── ...
- └── gender=female
-    ├── ...
-    │
-    ├── country=US
-    │   └── data.parquet
-    ├── country=CN
-    │   └── data.parquet
-    └── ...</code></pre></div>
-
-<p>By passing <code>path/to/table</code> to either <code>SQLContext.read.parquet</code> or <code>SQLContext.read.load</code>, Spark SQL
-will automatically extract the partitioning information from the paths.
-Now the schema of the returned DataFrame becomes:</p>
-
-<div class="highlight"><pre><code class="language-text" data-lang="text">root
-|-- name: string (nullable = true)
-|-- age: long (nullable = true)
-|-- gender: string (nullable = true)
-|-- country: string (nullable = true)</code></pre></div>
-
-<p>Notice that the data types of the partitioning columns are automatically inferred. Currently,
-numeric data types and string type are supported. Sometimes users may not want to automatically
-infer the data types of the partitioning columns. For these use cases, the automatic type inference
-can be configured by <code>spark.sql.sources.partitionColumnTypeInference.enabled</code>, which defaults to
-<code>true</code>. When type inference is disabled, string type will be used for the partitioning columns.</p>
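-
-<p>For example, a minimal sketch of disabling the inference (assuming the <code>sqlContext</code> from the previous
-examples and the directory layout shown above) is:</p>
-
-<div class="highlight"><pre><code class="language-scala" data-lang="scala">// Treat all partitioning columns as strings instead of inferring their types.
-sqlContext.setConf(&quot;spark.sql.sources.partitionColumnTypeInference.enabled&quot;, &quot;false&quot;)
-
-// gender and country are now both read back as string columns.
-val partitionedDF = sqlContext.read.parquet(&quot;path/to/table&quot;)</code></pre></div>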
-
-<h3 id="schema-merging">Schema Merging</h3>
-
-<p>Like ProtocolBuffer, Avro, and Thrift, Parquet also supports schema evolution. Users can start with
-a simple schema, and gradually add more columns to the schema as needed. In this way, users may end
-up with multiple Parquet files with different but mutually compatible schemas. The Parquet data
-source is now able to automatically detect this case and merge schemas of all these files.</p>
-
-<p>Since schema merging is a relatively expensive operation, and is not needed in most cases, it is
-turned off by default starting from 1.5.0. You may enable it by</p>
-
-<ol>
- <li>setting data source option <code>mergeSchema</code> to <code>true</code> when reading Parquet files (as shown in the
-examples below), or</li>
- <li>setting the global SQL option <code>spark.sql.parquet.mergeSchema</code> to <code>true</code>.</li>
-</ol>
-
-<div class="codetabs">
-
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// sqlContext from the previous example is used in this example.</span>
-<span class="c1">// This is used to implicitly convert an RDD to a DataFrame.</span>
-<span class="k">import</span> <span class="nn">sqlContext.implicits._</span>
-
-<span class="c1">// Create a simple DataFrame, stored into a partition directory</span>
-<span class="k">val</span> <span class="n">df1</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">makeRDD</span><span class="o">(</span><span class="mi">1</span> <span class="n">to</span> <span class="mi">5</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">i</span> <span class="k">=&gt;</span> <span class="o">(</span><span class="n">i</span><span class="o">,</span> <span class="n">i</span> <span class="o">*</span> <span class="mi">2</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;single&quot;</span><span class="o">,</span> <span class="s">&quot;double&quot;</span><span class="o">)</span>
-<span class="n">df1</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="o">(</span><span class="s">&quot;data/test_table/key=1&quot;</span><span class="o">)</span>
-
-<span class="c1">// Create another DataFrame in a new partition directory,</span>
-<span class="c1">// adding a new column and dropping an existing column</span>
-<span class="k">val</span> <span class="n">df2</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">makeRDD</span><span class="o">(</span><span class="mi">6</span> <span class="n">to</span> <span class="mi">10</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">i</span> <span class="k">=&gt;</span> <span class="o">(</span><span class="n">i</span><span class="o">,</span> <span class="n">i</span> <span class="o">*</span> <span class="mi">3</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;single&quot;</span><span class="o">,</span> <span class="s">&quot;triple&quot;</span><span class="o">)</span>
-<span class="n">df2</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="o">(</span><span class="s">&quot;data/test_table/key=2&quot;</span><span class="o">)</span>
-
-<span class="c1">// Read the partitioned table</span>
-<span class="k">val</span> <span class="n">df3</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">option</span><span class="o">(</span><span class="s">&quot;mergeSchema&quot;</span><span class="o">,</span> <span class="s">&quot;true&quot;</span><span class="o">).</span><span class="n">parquet</span><span class="o">(</span><span class="s">&quot;data/test_table&quot;</span><span class="o">)</span>
-<span class="n">df3</span><span class="o">.</span><span class="n">printSchema</span><span class="o">()</span>
-
-<span class="c1">// The final schema consists of all 3 columns in the Parquet files together</span>
-<span class="c1">// with the partitioning column appeared in the partition directory paths.</span>
-<span class="c1">// root</span>
-<span class="c1">// |-- single: int (nullable = true)</span>
-<span class="c1">// |-- double: int (nullable = true)</span>
-<span class="c1">// |-- triple: int (nullable = true)</span>
-<span class="c1">// |-- key : int (nullable = true)</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># sqlContext from the previous example is used in this example.</span>
-
-<span class="c"># Create a simple DataFrame, stored into a partition directory</span>
-<span class="n">df1</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">6</span><span class="p">))</span>\
- <span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="n">Row</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="n">i</span><span class="p">,</span> <span class="n">double</span><span class="o">=</span><span class="n">i</span> <span class="o">*</span> <span class="mi">2</span><span class="p">)))</span>
-<span class="n">df1</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">&quot;data/test_table/key=1&quot;</span><span class="p">)</span>
-
-<span class="c"># Create another DataFrame in a new partition directory,</span>
-<span class="c"># adding a new column and dropping an existing column</span>
-<span class="n">df2</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">6</span><span class="p">,</span> <span class="mi">11</span><span class="p">))</span>
- <span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="n">Row</span><span class="p">(</span><span class="n">single</span><span class="o">=</span><span class="n">i</span><span class="p">,</span> <span class="n">triple</span><span class="o">=</span><span class="n">i</span> <span class="o">*</span> <span class="mi">3</span><span class="p">)))</span>
-<span class="n">df2</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">&quot;data/test_table/key=2&quot;</span><span class="p">)</span>
-
-<span class="c"># Read the partitioned table</span>
-<span class="n">df3</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s">&quot;mergeSchema&quot;</span><span class="p">,</span> <span class="s">&quot;true&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">parquet</span><span class="p">(</span><span class="s">&quot;data/test_table&quot;</span><span class="p">)</span>
-<span class="n">df3</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
-
-<span class="c"># The final schema consists of all 3 columns in the Parquet files together</span>
-<span class="c"># with the partitioning column appeared in the partition directory paths.</span>
-<span class="c"># root</span>
-<span class="c"># |-- single: int (nullable = true)</span>
-<span class="c"># |-- double: int (nullable = true)</span>
-<span class="c"># |-- triple: int (nullable = true)</span>
-<span class="c"># |-- key : int (nullable = true)</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-
- <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># sqlContext from the previous example is used in this example.</span>
-
-<span class="c1"># Create a simple DataFrame, stored into a partition directory</span>
-saveDF<span class="p">(</span>df1<span class="p">,</span> <span class="s">&quot;data/test_table/key=1&quot;</span><span class="p">,</span> <span class="s">&quot;parquet&quot;</span><span class="p">,</span> <span class="s">&quot;overwrite&quot;</span><span class="p">)</span>
-
-<span class="c1"># Create another DataFrame in a new partition directory,</span>
-<span class="c1"># adding a new column and dropping an existing column</span>
-saveDF<span class="p">(</span>df2<span class="p">,</span> <span class="s">&quot;data/test_table/key=2&quot;</span><span class="p">,</span> <span class="s">&quot;parquet&quot;</span><span class="p">,</span> <span class="s">&quot;overwrite&quot;</span><span class="p">)</span>
-
-<span class="c1"># Read the partitioned table</span>
-df3 <span class="o">&lt;-</span> loadDF<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;data/test_table&quot;</span><span class="p">,</span> <span class="s">&quot;parquet&quot;</span><span class="p">,</span> mergeSchema<span class="o">=</span><span class="s">&quot;true&quot;</span><span class="p">)</span>
-printSchema<span class="p">(</span>df3<span class="p">)</span>
-
-<span class="c1"># The final schema consists of all 3 columns in the Parquet files together</span>
-<span class="c1"># with the partitioning column appeared in the partition directory paths.</span>
-<span class="c1"># root</span>
-<span class="c1"># |-- single: int (nullable = true)</span>
-<span class="c1"># |-- double: int (nullable = true)</span>
-<span class="c1"># |-- triple: int (nullable = true)</span>
-<span class="c1"># |-- key : int (nullable = true)</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h3 id="hive-metastore-parquet-table-conversion">Hive metastore Parquet table conversion</h3>
-
-<p>When reading from and writing to Hive metastore Parquet tables, Spark SQL will try to use its own
-Parquet support instead of Hive SerDe for better performance. This behavior is controlled by the
-<code>spark.sql.hive.convertMetastoreParquet</code> configuration, and is turned on by default.</p>
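-
-<p>For example, a minimal sketch of toggling this conversion for a session (assuming an existing <code>HiveContext</code> named <code>sqlContext</code>, as in the other examples in this guide):</p>
-
-<pre><code>// Fall back to the Hive SerDe for metastore Parquet tables.
-sqlContext.setConf("spark.sql.hive.convertMetastoreParquet", "false")
-
-// Restore the built-in Parquet support (the default).
-sqlContext.setConf("spark.sql.hive.convertMetastoreParquet", "true")
-</code></pre>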
-
-<h4 id="hiveparquet-schema-reconciliation">Hive/Parquet Schema Reconciliation</h4>
-
-<p>There are two key differences between Hive and Parquet from the perspective of table schema
-processing.</p>
-
-<ol>
- <li>Hive is case insensitive, while Parquet is not</li>
- <li>Hive considers all columns nullable, while nullability in Parquet is significant</li>
-</ol>
-
-<p>For this reason, we must reconcile the Hive metastore schema with the Parquet schema when converting a
-Hive metastore Parquet table to a Spark SQL Parquet table. The reconciliation rules are:</p>
-
-<ol>
- <li>
-    <p>Fields that have the same name in both schemas must have the same data type regardless of
-nullability. The reconciled field should have the data type of the Parquet side, so that
-nullability is respected.</p>
- </li>
- <li>
-    <p>The reconciled schema contains exactly those fields defined in the Hive metastore schema.</p>
-
- <ul>
- <li>Any fields that only appear in the Parquet schema are dropped in the reconciled schema.</li>
-      <li>Any fields that only appear in the Hive metastore schema are added as nullable fields in the
-reconciled schema.</li>
- </ul>
- </li>
-</ol>
-
-<h4 id="metadata-refreshing">Metadata Refreshing</h4>
-
-<p>Spark SQL caches Parquet metadata for better performance. When Hive metastore Parquet table
-conversion is enabled, metadata of those converted tables is also cached. If these tables are
-updated by Hive or other external tools, you need to refresh them manually to ensure consistent
-metadata.</p>
-
-<div class="codetabs">
-
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// sqlContext is an existing HiveContext</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">refreshTable</span><span class="o">(</span><span class="s">&quot;my_table&quot;</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// sqlContext is an existing HiveContext</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="na">refreshTable</span><span class="o">(</span><span class="s">&quot;my_table&quot;</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># sqlContext is an existing HiveContext</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">refreshTable</span><span class="p">(</span><span class="s">&quot;my_table&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="sql">
-
- <div class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="n">REFRESH</span> <span class="k">TABLE</span> <span class="n">my_table</span><span class="p">;</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h3 id="configuration">Configuration</h3>
-
-<p>Configuration of Parquet can be done using the <code>setConf</code> method on <code>SQLContext</code> or by running
-<code>SET key=value</code> commands using SQL.</p>
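-
-<p>For example, a short sketch (assuming an existing <code>SQLContext</code> named <code>sqlContext</code>) that sets two of the options listed below, once programmatically and once through SQL:</p>
-
-<pre><code>// Programmatically, via setConf:
-sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")
-
-// Or by running a SET command as SQL:
-sqlContext.sql("SET spark.sql.parquet.filterPushdown=true")
-</code></pre>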
-
-<table class="table">
-<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
-<tr>
- <td><code>spark.sql.parquet.binaryAsString</code></td>
- <td>false</td>
- <td>
- Some other Parquet-producing systems, in particular Impala, Hive, and older versions of Spark SQL, do
- not differentiate between binary data and strings when writing out the Parquet schema. This
- flag tells Spark SQL to interpret binary data as a string to provide compatibility with these systems.
- </td>
-</tr>
-<tr>
- <td><code>spark.sql.parquet.int96AsTimestamp</code></td>
- <td>true</td>
- <td>
- Some Parquet-producing systems, in particular Impala and Hive, store Timestamp into INT96. This
- flag tells Spark SQL to interpret INT96 data as a timestamp to provide compatibility with these systems.
- </td>
-</tr>
-<tr>
- <td><code>spark.sql.parquet.cacheMetadata</code></td>
- <td>true</td>
- <td>
- Turns on caching of Parquet schema metadata. Can speed up querying of static data.
- </td>
-</tr>
-<tr>
- <td><code>spark.sql.parquet.compression.codec</code></td>
- <td>gzip</td>
- <td>
-    Sets the compression codec used when writing Parquet files. Acceptable values include:
- uncompressed, snappy, gzip, lzo.
- </td>
-</tr>
-<tr>
- <td><code>spark.sql.parquet.filterPushdown</code></td>
- <td>true</td>
- <td>Enables Parquet filter push-down optimization when set to true.</td>
-</tr>
-<tr>
- <td><code>spark.sql.hive.convertMetastoreParquet</code></td>
- <td>true</td>
- <td>
-    When set to false, Spark SQL will use the Hive SerDe for Parquet tables instead of the built-in
-    support.
- </td>
-</tr>
-<tr>
- <td><code>spark.sql.parquet.output.committer.class</code></td>
- <td><code>org.apache.parquet.hadoop.<br />ParquetOutputCommitter</code></td>
- <td>
- <p>
- The output committer class used by Parquet. The specified class needs to be a subclass of
- <code>org.apache.hadoop.<br />mapreduce.OutputCommitter</code>. Typically, it's also a
- subclass of <code>org.apache.parquet.hadoop.ParquetOutputCommitter</code>.
- </p>
- <p>
- <b>Note:</b>
- <ul>
- <li>
- This option is automatically ignored if <code>spark.speculation</code> is turned on.
- </li>
- <li>
- This option must be set via Hadoop <code>Configuration</code> rather than Spark
- <code>SQLConf</code>.
- </li>
- <li>
- This option overrides <code>spark.sql.sources.<br />outputCommitterClass</code>.
- </li>
- </ul>
- </p>
- <p>
-      Spark SQL comes with a built-in
-      <code>org.apache.spark.sql.<br />parquet.DirectParquetOutputCommitter</code>, which can be more
-      efficient than the default Parquet output committer when writing data to S3.
- </p>
- </td>
-</tr>
-<tr>
- <td><code>spark.sql.parquet.mergeSchema</code></td>
- <td><code>false</code></td>
- <td>
- <p>
- When true, the Parquet data source merges schemas collected from all data files, otherwise the
- schema is picked from the summary file or a random data file if no summary file is available.
- </p>
- </td>
-</tr>
-</table>
-
-<h2 id="json-datasets">JSON Datasets</h2>
-<div class="codetabs">
-
-<div data-lang="scala">
- <p>Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame.
-This conversion can be done using <code>SQLContext.read.json()</code> on either an RDD of String,
-or a JSON file.</p>
-
- <p>Note that the file that is offered as <em>a json file</em> is not a typical JSON file. Each
-line must contain a separate, self-contained valid JSON object. As a consequence,
-a regular multi-line JSON file will most often fail.</p>
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// sc is an existing SparkContext.</span>
-<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="nc">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
-
-<span class="c1">// A JSON dataset is pointed to by path.</span>
-<span class="c1">// The path can be either a single text file or a directory storing text files.</span>
-<span class="k">val</span> <span class="n">path</span> <span class="k">=</span> <span class="s">&quot;examples/src/main/resources/people.json&quot;</span>
-<span class="k">val</span> <span class="n">people</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">json</span><span class="o">(</span><span class="n">path</span><span class="o">)</span>
-
-<span class="c1">// The inferred schema can be visualized using the printSchema() method.</span>
-<span class="n">people</span><span class="o">.</span><span class="n">printSchema</span><span class="o">()</span>
-<span class="c1">// root</span>
-<span class="c1">// |-- age: integer (nullable = true)</span>
-<span class="c1">// |-- name: string (nullable = true)</span>
-
-<span class="c1">// Register this DataFrame as a table.</span>
-<span class="n">people</span><span class="o">.</span><span class="n">registerTempTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">)</span>
-
-<span class="c1">// SQL statements can be run by using the sql methods provided by sqlContext.</span>
-<span class="k">val</span> <span class="n">teenagers</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">)</span>
-
-<span class="c1">// Alternatively, a DataFrame can be created for a JSON dataset represented by</span>
-<span class="c1">// an RDD[String] storing one JSON object per string.</span>
-<span class="k">val</span> <span class="n">anotherPeopleRDD</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="o">(</span>
- <span class="s">&quot;&quot;&quot;{&quot;name&quot;:&quot;Yin&quot;,&quot;address&quot;:{&quot;city&quot;:&quot;Columbus&quot;,&quot;state&quot;:&quot;Ohio&quot;}}&quot;&quot;&quot;</span> <span class="o">::</span> <span class="nc">Nil</span><span class="o">)</span>
-<span class="k">val</span> <span class="n">anotherPeople</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">json</span><span class="o">(</span><span class="n">anotherPeopleRDD</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
- <p>Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame.
-This conversion can be done using <code>SQLContext.read().json()</code> on either an RDD of String,
-or a JSON file.</p>
-
- <p>Note that the file that is offered as <em>a json file</em> is not a typical JSON file. Each
-line must contain a separate, self-contained valid JSON object. As a consequence,
-a regular multi-line JSON file will most often fail.</p>
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// sc is an existing JavaSparkContext.</span>
-<span class="n">SQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">SQLContext</span><span class="o">(</span><span class="n">sc</span><span class="o">);</span>
-
-<span class="c1">// A JSON dataset is pointed to by path.</span>
-<span class="c1">// The path can be either a single text file or a directory storing text files.</span>
-<span class="n">DataFrame</span> <span class="n">people</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">json</span><span class="o">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="o">);</span>
-
-<span class="c1">// The inferred schema can be visualized using the printSchema() method.</span>
-<span class="n">people</span><span class="o">.</span><span class="na">printSchema</span><span class="o">();</span>
-<span class="c1">// root</span>
-<span class="c1">// |-- age: integer (nullable = true)</span>
-<span class="c1">// |-- name: string (nullable = true)</span>
-
-<span class="c1">// Register this DataFrame as a table.</span>
-<span class="n">people</span><span class="o">.</span><span class="na">registerTempTable</span><span class="o">(</span><span class="s">&quot;people&quot;</span><span class="o">);</span>
-
-<span class="c1">// SQL statements can be run by using the sql methods provided by sqlContext.</span>
-<span class="n">DataFrame</span> <span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="o">);</span>
-
-<span class="c1">// Alternatively, a DataFrame can be created for a JSON dataset represented by</span>
-<span class="c1">// an RDD[String] storing one JSON object per string.</span>
-<span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">jsonData</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
- <span class="s">&quot;{\&quot;name\&quot;:\&quot;Yin\&quot;,\&quot;address\&quot;:{\&quot;city\&quot;:\&quot;Columbus\&quot;,\&quot;state\&quot;:\&quot;Ohio\&quot;}}&quot;</span><span class="o">);</span>
-<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">anotherPeopleRDD</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">jsonData</span><span class="o">);</span>
-<span class="n">DataFrame</span> <span class="n">anotherPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">json</span><span class="o">(</span><span class="n">anotherPeopleRDD</span><span class="o">);</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
- <p>Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame.
-This conversion can be done using <code>SQLContext.read.json</code> on a JSON file.</p>
-
- <p>Note that the file that is offered as <em>a json file</em> is not a typical JSON file. Each
-line must contain a separate, self-contained valid JSON object. As a consequence,
-a regular multi-line JSON file will most often fail.</p>
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># sc is an existing SparkContext.</span>
-<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SQLContext</span>
-<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
-
-<span class="c"># A JSON dataset is pointed to by path.</span>
-<span class="c"># The path can be either a single text file or a directory storing text files.</span>
-<span class="n">people</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">json</span><span class="p">(</span><span class="s">&quot;examples/src/main/resources/people.json&quot;</span><span class="p">)</span>
-
-<span class="c"># The inferred schema can be visualized using the printSchema() method.</span>
-<span class="n">people</span><span class="o">.</span><span class="n">printSchema</span><span class="p">()</span>
-<span class="c"># root</span>
-<span class="c"># |-- age: integer (nullable = true)</span>
-<span class="c"># |-- name: string (nullable = true)</span>
-
-<span class="c"># Register this DataFrame as a table.</span>
-<span class="n">people</span><span class="o">.</span><span class="n">registerTempTable</span><span class="p">(</span><span class="s">&quot;people&quot;</span><span class="p">)</span>
-
-<span class="c"># SQL statements can be run by using the sql methods provided by `sqlContext`.</span>
-<span class="n">teenagers</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span>
-
-<span class="c"># Alternatively, a DataFrame can be created for a JSON dataset represented by</span>
-<span class="c"># an RDD[String] storing one JSON object per string.</span>
-<span class="n">anotherPeopleRDD</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
- <span class="s">&#39;{&quot;name&quot;:&quot;Yin&quot;,&quot;address&quot;:{&quot;city&quot;:&quot;Columbus&quot;,&quot;state&quot;:&quot;Ohio&quot;}}&#39;</span><span class="p">])</span>
-<span class="n">anotherPeople</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">jsonRDD</span><span class="p">(</span><span class="n">anotherPeopleRDD</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-  <p>Spark SQL can automatically infer the schema of a JSON dataset and load it as a DataFrame using
-the <code>jsonFile</code> function, which loads data from a directory of JSON files where each line of the
-files is a JSON object.</p>
-
- <p>Note that the file that is offered as <em>a json file</em> is not a typical JSON file. Each
-line must contain a separate, self-contained valid JSON object. As a consequence,
-a regular multi-line JSON file will most often fail.</p>
-
- <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># sc is an existing SparkContext.</span>
-sqlContext <span class="o">&lt;-</span> sparkRSQL.init<span class="p">(</span>sc<span class="p">)</span>
-
-<span class="c1"># A JSON dataset is pointed to by path.</span>
-<span class="c1"># The path can be either a single text file or a directory storing text files.</span>
-path <span class="o">&lt;-</span> <span class="s">&quot;examples/src/main/resources/people.json&quot;</span>
-<span class="c1"># Create a DataFrame from the file(s) pointed to by path</span>
-people <span class="o">&lt;-</span> jsonFile<span class="p">(</span>sqlContext<span class="p">,</span> path<span class="p">)</span>
-
-<span class="c1"># The inferred schema can be visualized using the printSchema() method.</span>
-printSchema<span class="p">(</span>people<span class="p">)</span>
-<span class="c1"># root</span>
-<span class="c1"># |-- age: integer (nullable = true)</span>
-<span class="c1"># |-- name: string (nullable = true)</span>
-
-<span class="c1"># Register this DataFrame as a table.</span>
-registerTempTable<span class="p">(</span>people<span class="p">,</span> <span class="s">&quot;people&quot;</span><span class="p">)</span>
-
-<span class="c1"># SQL statements can be run by using the sql methods provided by `sqlContext`.</span>
-teenagers <span class="o">&lt;-</span> sql<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="sql">
-
- <div class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TEMPORARY</span> <span class="k">TABLE</span> <span class="n">jsonTable</span>
-<span class="k">USING</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">spark</span><span class="p">.</span><span class="k">sql</span><span class="p">.</span><span class="n">json</span>
-<span class="k">OPTIONS</span> <span class="p">(</span>
- <span class="n">path</span> <span class="ss">&quot;examples/src/main/resources/people.json&quot;</span>
-<span class="p">)</span>
-
-<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">jsonTable</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h2 id="hive-tables">Hive Tables</h2>
-
-<p>Spark SQL also supports reading and writing data stored in <a href="http://hive.apache.org/">Apache Hive</a>.
-However, since Hive has a large number of dependencies, it is not included in the default Spark assembly.
-Hive support is enabled by adding the <code>-Phive</code> and <code>-Phive-thriftserver</code> flags to Spark&#8217;s build.
-This command builds a new assembly jar that includes Hive. Note that this Hive assembly jar must also be present
-on all of the worker nodes, as they will need access to the Hive serialization and deserialization libraries
-(SerDes) in order to access data stored in Hive.</p>
-
-<p>Configuration of Hive is done by placing your <code>hive-site.xml</code> file in <code>conf/</code>. Please note when running
-the query on a YARN cluster (<code>yarn-cluster</code> mode), the <code>datanucleus</code> jars under the <code>lib_managed/jars</code> directory
-and <code>hive-site.xml</code> under <code>conf/</code> directory need to be available on the driver and all executors launched by the
-YARN cluster. A convenient way to do this is to add them through the <code>--jars</code> and <code>--files</code> options of the
-<code>spark-submit</code> command.</p>
-
-<div class="codetabs">
-
-<div data-lang="scala">
-
- <p>When working with Hive one must construct a <code>HiveContext</code>, which inherits from <code>SQLContext</code>, and
-adds support for finding tables in the MetaStore and writing queries using HiveQL. Users who do
-not have an existing Hive deployment can still create a <code>HiveContext</code>. When not configured by the
-hive-site.xml, the context automatically creates <code>metastore_db</code> and <code>warehouse</code> in the current
-directory.</p>
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// sc is an existing SparkContext.</span>
-<span class="k">val</span> <span class="n">sqlContext</span> <span class="k">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="nc">HiveContext</span><span class="o">(</span><span class="n">sc</span><span class="o">)</span>
-
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;CREATE TABLE IF NOT EXISTS src (key INT, value STRING)&quot;</span><span class="o">)</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;LOAD DATA LOCAL INPATH &#39;examples/src/main/resources/kv1.txt&#39; INTO TABLE src&quot;</span><span class="o">)</span>
-
-<span class="c1">// Queries are expressed in HiveQL</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="o">(</span><span class="s">&quot;FROM src SELECT key, value&quot;</span><span class="o">).</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <p>When working with Hive one must construct a <code>HiveContext</code>, which inherits from <code>SQLContext</code>, and
-adds support for finding tables in the MetaStore and writing queries using HiveQL. In addition to
-the <code>sql</code> method a <code>HiveContext</code> also provides an <code>hql</code> method, which allows queries to be
-expressed in HiveQL.</p>
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// sc is an existing JavaSparkContext.</span>
-<span class="n">HiveContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="k">new</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">hive</span><span class="o">.</span><span class="na">HiveContext</span><span class="o">(</span><span class="n">sc</span><span class="o">.</span><span class="na">sc</span><span class="o">);</span>
-
-<span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;CREATE TABLE IF NOT EXISTS src (key INT, value STRING)&quot;</span><span class="o">);</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;LOAD DATA LOCAL INPATH &#39;examples/src/main/resources/kv1.txt&#39; INTO TABLE src&quot;</span><span class="o">);</span>
-
-<span class="c1">// Queries are expressed in HiveQL.</span>
-<span class="n">Row</span><span class="o">[]</span> <span class="n">results</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">&quot;FROM src SELECT key, value&quot;</span><span class="o">).</span><span class="na">collect</span><span class="o">();</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <p>When working with Hive one must construct a <code>HiveContext</code>, which inherits from <code>SQLContext</code>, and
-adds support for finding tables in the MetaStore and writing queries using HiveQL.</p>
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="c"># sc is an existing SparkContext.</span>
-<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">HiveContext</span>
-<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">HiveContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
-
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;CREATE TABLE IF NOT EXISTS src (key INT, value STRING)&quot;</span><span class="p">)</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;LOAD DATA LOCAL INPATH &#39;examples/src/main/resources/kv1.txt&#39; INTO TABLE src&quot;</span><span class="p">)</span>
-
-<span class="c"># Queries can be expressed in HiveQL.</span>
-<span class="n">results</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s">&quot;FROM src SELECT key, value&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-
- <p>When working with Hive one must construct a <code>HiveContext</code>, which inherits from <code>SQLContext</code>, and
-adds support for finding tables in the MetaStore and writing queries using HiveQL.</p>
-
- <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># sc is an existing SparkContext.</span>
-sqlContext <span class="o">&lt;-</span> sparkRHive.init<span class="p">(</span>sc<span class="p">)</span>
-
-sql<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;CREATE TABLE IF NOT EXISTS src (key INT, value STRING)&quot;</span><span class="p">)</span>
-sql<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;LOAD DATA LOCAL INPATH &#39;examples/src/main/resources/kv1.txt&#39; INTO TABLE src&quot;</span><span class="p">)</span>
-
-<span class="c1"># Queries can be expressed in HiveQL.</span>
-results <span class="o">&lt;-</span> collect<span class="p">(</span>sql<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;FROM src SELECT key, value&quot;</span><span class="p">))</span></code></pre></div>
-
- </div>
-</div>
-
-<h3 id="interacting-with-different-versions-of-hive-metastore">Interacting with Different Versions of Hive Metastore</h3>
-
-<p>One of the most important pieces of Spark SQL&#8217;s Hive support is interaction with Hive metastore,
-which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary
-build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below.
-Note that independent of the version of Hive that is being used to talk to the metastore, internally Spark SQL
-will compile against Hive 1.2.1 and use those classes for internal execution (serdes, UDFs, UDAFs, etc).</p>
-
-<p>The following options can be used to configure the version of Hive that is used to retrieve metadata:</p>
-
-<table class="table">
- <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
- <tr>
- <td><code>spark.sql.hive.metastore.version</code></td>
- <td><code>1.2.1</code></td>
- <td>
- Version of the Hive metastore. Available
- options are <code>0.12.0</code> through <code>1.2.1</code>.
- </td>
- </tr>
- <tr>
- <td><code>spark.sql.hive.metastore.jars</code></td>
- <td><code>builtin</code></td>
- <td>
- Location of the jars that should be used to instantiate the HiveMetastoreClient. This
- property can be one of three options:
- <ol>
-      <li><code>builtin</code>:
-      Use Hive 1.2.1, which is bundled with the Spark assembly jar when <code>-Phive</code> is
-      enabled. When this option is chosen, <code>spark.sql.hive.metastore.version</code> must be
-      either <code>1.2.1</code> or not defined.</li>
-      <li><code>maven</code>:
-      Use Hive jars of the specified version downloaded from Maven repositories. This configuration
-      is not generally recommended for production deployments.</li>
-      <li>A classpath in the standard format for the JVM. This classpath must include all of Hive
-      and its dependencies, including the correct version of Hadoop. These jars only need to be
-      present on the driver, but if you are running in yarn cluster mode then you must ensure
-      they are packaged with your application.</li>
- </ol>
- </td>
- </tr>
- <tr>
- <td><code>spark.sql.hive.metastore.sharedPrefixes</code></td>
- <td><code>com.mysql.jdbc,<br />org.postgresql,<br />com.microsoft.sqlserver,<br />oracle.jdbc</code></td>
- <td>
- <p>
- A comma separated list of class prefixes that should be loaded using the classloader that is
- shared between Spark SQL and a specific version of Hive. An example of classes that should
- be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need
- to be shared are those that interact with classes that are already shared. For example,
- custom appenders that are used by log4j.
- </p>
- </td>
- </tr>
- <tr>
- <td><code>spark.sql.hive.metastore.barrierPrefixes</code></td>
- <td><code>(empty)</code></td>
- <td>
- <p>
- A comma separated list of class prefixes that should explicitly be reloaded for each version
- of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a
- prefix that typically would be shared (i.e. <code>org.apache.spark.*</code>).
- </p>
- </td>
- </tr>
-</table>
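-
-<p>As an illustrative sketch, the metastore version and jar location options above can be supplied through the <code>SparkConf</code> used to build the application's <code>SparkContext</code>, so that they are in place before the <code>HiveContext</code> is created (the metastore version shown here is only an example):</p>
-
-<pre><code>import org.apache.spark.{SparkConf, SparkContext}
-
-val conf = new SparkConf()
-  .set("spark.sql.hive.metastore.version", "0.13.1")
-  .set("spark.sql.hive.metastore.jars", "maven")
-val sc = new SparkContext(conf)
-val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
-</code></pre>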
-
-<h2 id="jdbc-to-other-databases">JDBC To Other Databases</h2>
-
-<p>Spark SQL also includes a data source that can read data from other databases using JDBC. This
-functionality should be preferred over using <a href="api/scala/index.html#org.apache.spark.rdd.JdbcRDD">JdbcRDD</a>.
-This is because the results are returned
-as a DataFrame and they can easily be processed in Spark SQL or joined with other data sources.
-The JDBC data source is also easier to use from Java or Python as it does not require the user to
-provide a ClassTag.
-(Note that this is different than the Spark SQL JDBC server, which allows other applications to
-run queries using Spark SQL).</p>
-
-<p>To get started you will need to include the JDBC driver for your particular database on the
-Spark classpath. For example, to connect to postgres from the Spark Shell you would run the
-following command:</p>
-
-<div class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nv">SPARK_CLASSPATH</span><span class="o">=</span>postgresql-9.3-1102-jdbc41.jar bin/spark-shell</code></pre></div>
-
-<p>Tables from the remote database can be loaded as a DataFrame or Spark SQL Temporary table using
-the Data Sources API. The following options are supported:</p>
-
-<table class="table">
- <tr><th>Property Name</th><th>Meaning</th></tr>
- <tr>
- <td><code>url</code></td>
- <td>
- The JDBC URL to connect to.
- </td>
- </tr>
- <tr>
- <td><code>dbtable</code></td>
- <td>
- The JDBC table that should be read. Note that anything that is valid in a <code>FROM</code> clause of
- a SQL query can be used. For example, instead of a full table you could also use a
- subquery in parentheses.
- </td>
- </tr>
-
- <tr>
- <td><code>driver</code></td>
- <td>
- The class name of the JDBC driver needed to connect to this URL. This class will be loaded
-      on the master and workers before running any JDBC commands to allow the driver to
- register itself with the JDBC subsystem.
- </td>
- </tr>
- <tr>
- <td><code>partitionColumn, lowerBound, upperBound, numPartitions</code></td>
- <td>
- These options must all be specified if any of them is specified. They describe how to
- partition the table when reading in parallel from multiple workers.
- <code>partitionColumn</code> must be a numeric column from the table in question. Notice
- that <code>lowerBound</code> and <code>upperBound</code> are just used to decide the
-      partition stride, not for filtering the rows in the table. So all rows in the table will be
- partitioned and returned.
- </td>
- </tr>
-</table>
-
-<div class="codetabs">
-
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">jdbcDF</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="o">(</span><span class="s">&quot;jdbc&quot;</span><span class="o">).</span><span class="n">options</span><span class="o">(</span>
- <span class="nc">Map</span><span class="o">(</span><span class="s">&quot;url&quot;</span> <span class="o">-&gt;</span> <span class="s">&quot;jdbc:postgresql:dbserver&quot;</span><span class="o">,</span>
- <span class="s">&quot;dbtable&quot;</span> <span class="o">-&gt;</span> <span class="s">&quot;schema.tablename&quot;</span><span class="o">)).</span><span class="n">load</span><span class="o">()</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">Map</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;</span> <span class="n">options</span> <span class="o">=</span> <span class="k">new</span> <span class="n">HashMap</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;();</span>
-<span class="n">options</span><span class="o">.</span><span class="na">put</span><span class="o">(</span><span class="s">&quot;url&quot;</span><span class="o">,</span> <span class="s">&quot;jdbc:postgresql:dbserver&quot;</span><span class="o">);</span>
-<span class="n">options</span><span class="o">.</span><span class="na">put</span><span class="o">(</span><span class="s">&quot;dbtable&quot;</span><span class="o">,</span> <span class="s">&quot;schema.tablename&quot;</span><span class="o">);</span>
-
-<span class="n">DataFrame</span> <span class="n">jdbcDF</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;jdbc&quot;</span><span class="o">).</span> <span class="n">options</span><span class="o">(</span><span class="n">options</span><span class="o">).</span><span class="na">load</span><span class="o">();</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s">&#39;jdbc&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="n">url</span><span class="o">=</span><span class="s">&#39;jdbc:postgresql:dbserver&#39;</span><span class="p">,</span> <span class="n">dbtable</span><span class="o">=</span><span class="s">&#39;schema.tablename&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">()</span></code></pre></div>
-
- </div>
-
-<div data-lang="r">
-
- <div class="highlight"><pre><code class="language-r" data-lang="r">df <span class="o">&lt;-</span> loadDF<span class="p">(</span>sqlContext<span class="p">,</span> <span class="kn">source</span><span class="o">=</span><span class="s">&quot;jdbc&quot;</span><span class="p">,</span> url<span class="o">=</span><span class="s">&quot;jdbc:postgresql:dbserver&quot;</span><span class="p">,</span> dbtable<span class="o">=</span><span class="s">&quot;schema.tablename&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="sql">
-
- <div class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">TEMPORARY</span> <span class="k">TABLE</span> <span class="n">jdbcTable</span>
-<span class="k">USING</span> <span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">spark</span><span class="p">.</span><span class="k">sql</span><span class="p">.</span><span class="n">jdbc</span>
-<span class="k">OPTIONS</span> <span class="p">(</span>
- <span class="n">url</span> <span class="ss">&quot;jdbc:postgresql:dbserver&quot;</span><span class="p">,</span>
- <span class="n">dbtable</span> <span class="ss">&quot;schema.tablename&quot;</span>
-<span class="p">)</span></code></pre></div>
-
- </div>
-</div>
-
-<h2 id="troubleshooting">Troubleshooting</h2>
-
-<ul>
- <li>The JDBC driver class must be visible to the primordial class loader on the client session and on all executors. This is because Java&#8217;s DriverManager class does a security check that results in it ignoring all drivers not visible to the primordial class loader when one goes to open a connection. One convenient way to do this is to modify compute_classpath.sh on all worker nodes to include your driver JARs.</li>
- <li>Some databases, such as H2, convert all names to upper case. You&#8217;ll need to use upper case to refer to those names in Spark SQL.</li>
-</ul>
-
-<h1 id="performance-tuning">Performance Tuning</h1>
-
-<p>For some workloads it is possible to improve performance by either caching data in memory, or by
-turning on some experimental options.</p>
-
-<h2 id="caching-data-in-memory">Caching Data In Memory</h2>
-
-<p>Spark SQL can cache tables using an in-memory columnar format by calling <code>sqlContext.cacheTable("tableName")</code> or <code>dataFrame.cache()</code>.
-Then Spark SQL will scan only required columns and will automatically tune compression to minimize
-memory usage and GC pressure. You can call <code>sqlContext.uncacheTable("tableName")</code> to remove the table from memory.</p>
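-
-<p>For example (assuming the <code>people</code> table registered in the earlier JSON example):</p>
-
-<pre><code>// Cache the table in the in-memory columnar format, query it, then release the memory.
-sqlContext.cacheTable("people")
-sqlContext.sql("SELECT COUNT(*) FROM people").show()
-sqlContext.uncacheTable("people")
-</code></pre>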
-
-<p>Configuration of in-memory caching can be done using the <code>setConf</code> method on <code>SQLContext</code> or by running
-<code>SET key=value</code> commands using SQL.</p>
-
-<table class="table">
-<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
-<tr>
- <td><code>spark.sql.inMemoryColumnarStorage.compressed</code></td>
- <td>true</td>
- <td>
-    When set to true, Spark SQL will automatically select a compression codec for each column based
- on statistics of the data.
- </td>
-</tr>
-<tr>
- <td><code>spark.sql.inMemoryColumnarStorage.batchSize</code></td>
- <td>10000</td>
- <td>
- Controls the size of batches for columnar caching. Larger batch sizes can improve memory utilization
- and compression, but risk OOMs when caching data.
- </td>
-</tr>
-
-</table>
-
-<h2 id="other-configuration-options">Other Configuration Options</h2>
-
-<p>The following options can also be used to tune the performance of query execution. It is possible
-that these options will be deprecated in a future release as more optimizations are performed automatically.</p>
-
-<table class="table">
- <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
- <tr>
- <td><code>spark.sql.autoBroadcastJoinThreshold</code></td>
- <td>10485760 (10 MB)</td>
- <td>
- Configures the maximum size in bytes for a table that will be broadcast to all worker nodes when
- performing a join. By setting this value to -1 broadcasting can be disabled. Note that currently
- statistics are only supported for Hive Metastore tables where the command
- <code>ANALYZE TABLE &lt;tableName&gt; COMPUTE STATISTICS noscan</code> has been run.
- </td>
- </tr>
- <tr>
- <td><code>spark.sql.tungsten.enabled</code></td>
- <td>true</td>
- <td>
- When true, use the optimized Tungsten physical execution backend which explicitly manages memory
- and dynamically generates bytecode for expression evaluation.
- </td>
- </tr>
- <tr>
- <td><code>spark.sql.shuffle.partitions</code></td>
- <td>200</td>
- <td>
- Configures the number of partitions to use when shuffling data for joins or aggregations.
- </td>
- </tr>
- <tr>
- <td><code>spark.sql.planner.externalSort</code></td>
- <td>true</td>
- <td>
-      When true, performs sorts spilling to disk as needed; otherwise, sorts each partition in memory.
- </td>
- </tr>
-</table>
-
-<h1 id="distributed-sql-engine">Distributed SQL Engine</h1>
-
-<p>Spark SQL can also act as a distributed query engine using its JDBC/ODBC or command-line interface.
-In this mode, end-users or applications can interact with Spark SQL directly to run SQL queries,
-without the need to write any code.</p>
-
-<h2 id="running-the-thrift-jdbcodbc-server">Running the Thrift JDBC/ODBC server</h2>
-
-<p>The Thrift JDBC/ODBC server implemented here corresponds to the <a href="https://cwiki.apache.org/confluence/display/Hive/Setting+Up+HiveServer2"><code>HiveServer2</code></a>
-in Hive 0.13. You can test the JDBC server with the beeline script that comes with either Spark or Hive 0.13.</p>
-
-<p>To start the JDBC/ODBC server, run the following in the Spark directory:</p>
-
-<pre><code>./sbin/start-thriftserver.sh
-</code></pre>
-
-<p>This script accepts all <code>bin/spark-submit</code> command line options, plus a <code>--hiveconf</code> option to
-specify Hive properties. You may run <code>./sbin/start-thriftserver.sh --help</code> for a complete list of
-all available options. By default, the server listens on localhost:10000. You may override this
-behaviour via either environment variables, i.e.:</p>
-
-<div class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">export </span><span class="nv">HIVE_SERVER2_THRIFT_PORT</span><span class="o">=</span>&lt;listening-port&gt;
-<span class="nb">export </span><span class="nv">HIVE_SERVER2_THRIFT_BIND_HOST</span><span class="o">=</span>&lt;listening-host&gt;
-./sbin/start-thriftserver.sh <span class="se">\</span>
- --master &lt;master-uri&gt; <span class="se">\</span>
- ...</code></pre></div>
-
-<p>or system properties:</p>
-
-<div class="highlight"><pre><code class="language-bash" data-lang="bash">./sbin/start-thriftserver.sh <span class="se">\</span>
- --hiveconf hive.server2.thrift.port<span class="o">=</span>&lt;listening-port&gt; <span class="se">\</span>
- --hiveconf hive.server2.thrift.bind.host<span class="o">=</span>&lt;listening-host&gt; <span class="se">\</span>
-  --master &lt;master-uri&gt; <span class="se">\</span>
- ...</code></pre></div>
-
-<p>Now you can use beeline to test the Thrift JDBC/ODBC server:</p>
-
-<pre><code>./bin/beeline
-</code></pre>
-
-<p>Connect to the JDBC/ODBC server in beeline with:</p>
-
-<pre><code>beeline&gt; !connect jdbc:hive2://localhost:10000
-</code></pre>
-
-<p>Beeline will ask you for a username and password. In non-secure mode, simply enter the username on
-your machine and a blank password. For secure mode, please follow the instructions given in the
-<a href="https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients">beeline documentation</a>.</p>
-
-<p>Configuration of Hive is done by placing your <code>hive-site.xml</code> file in <code>conf/</code>.</p>
-
-<p>You may also use the beeline script that comes with Hive.</p>
-
-<p>The Thrift JDBC server also supports sending Thrift RPC messages over HTTP transport.
-Use the following setting to enable HTTP mode as system property or in <code>hive-site.xml</code> file in <code>conf/</code>:</p>
-
-<pre><code>hive.server2.transport.mode - Set this to value: http
-hive.server2.thrift.http.port - HTTP port number to listen on; default is 10001
-hive.server2.http.endpoint - HTTP endpoint; default is cliservice
-</code></pre>
-
-<p>To test, use beeline to connect to the JDBC/ODBC server in http mode with:</p>
-
-<pre><code>beeline&gt; !connect jdbc:hive2://&lt;host&gt;:&lt;port&gt;/&lt;database&gt;?hive.server2.transport.mode=http;hive.server2.thrift.http.path=&lt;http_endpoint&gt;
-</code></pre>
-
-<h2 id="running-the-spark-sql-cli">Running the Spark SQL CLI</h2>
-
-<p>The Spark SQL CLI is a convenient tool to run the Hive metastore service in local mode and execute
-queries input from the command line. Note that the Spark SQL CLI cannot talk to the Thrift JDBC server.</p>
-
-<p>To start the Spark SQL CLI, run the following in the Spark directory:</p>
-
-<pre><code>./bin/spark-sql
-</code></pre>
-
-<p>Configuration of Hive is done by placing your <code>hive-site.xml</code> file in <code>conf/</code>.
-You may run <code>./bin/spark-sql --help</code> for a complete list of all available
-options.</p>
-
-<h1 id="migration-guide">Migration Guide</h1>
-
-<h2 id="upgrading-from-spark-sql-14-to-15">Upgrading From Spark SQL 1.4 to 1.5</h2>
-
-<ul>
- <li>Optimized execution using manually managed memory (Tungsten) is now enabled by default, along with
-code generation for expression evaluation. These features can both be disabled by setting
-<code>spark.sql.tungsten.enabled</code> to <code>false</code> (see the short sketch after this list).</li>
- <li>Parquet schema merging is no longer enabled by default. It can be re-enabled by setting
-<code>spark.sql.parquet.mergeSchema</code> to <code>true</code>.</li>
- <li>Resolution of strings to columns in python now supports using dots (<code>.</code>) to qualify the column or
-access nested values. For example <code>df['table.column.nestedField']</code>. However, this means that if
-your column name contains any dots you must now escape them using backticks (e.g., <code>table.`column.with.dots`.nested</code>). </li>
- <li>In-memory columnar storage partition pruning is on by default. It can be disabled by setting
-<code>spark.sql.inMemoryColumnarStorage.partitionPruning</code> to <code>false</code>.</li>
- <li>Unlimited precision decimal columns are no longer supported, instead Spark SQL enforces a maximum
-precision of 38. When inferring schema from <code>BigDecimal</code> objects, a precision of (38, 18) is now
-used. When no precision is specified in DDL then the default remains <code>Decimal(10, 0)</code>.</li>
-  <li>Timestamps are now stored at a precision of 1us, rather than 1ns.</li>
- <li>In the <code>sql</code> dialect, floating point numbers are now parsed as decimal. HiveQL parsing remains
-unchanged.</li>
-  <li>The canonical names of SQL/DataFrame functions are now lower case (e.g. sum vs SUM).</li>
- <li>It has been determined that using the DirectOutputCommitter when speculation is enabled is unsafe
-and thus this output committer will not be used when speculation is on, independent of configuration.</li>
- <li>JSON data source will not automatically load new files that are created by other applications
-(i.e. files that are not inserted to the dataset through Spark SQL).
-For a JSON persistent table (i.e. the metadata of the table is stored in Hive Metastore),
-users can use <code>REFRESH TABLE</code> SQL command or <code>HiveContext</code>&#8217;s <code>refreshTable</code> method
-to include those new files to the table. For a DataFrame representing a JSON dataset, users need to recreate
-the DataFrame and the new DataFrame will include new files.</li>
-</ul>
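-
-<p>As a short sketch, the first two of these defaults can be reverted for a single application (assuming an existing <code>SQLContext</code> named <code>sqlContext</code>):</p>
-
-<pre><code>// Disable Tungsten execution and code generation:
-sqlContext.setConf("spark.sql.tungsten.enabled", "false")
-
-// Re-enable Parquet schema merging:
-sqlContext.setConf("spark.sql.parquet.mergeSchema", "true")
-</code></pre>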
-
-<h2 id="upgrading-from-spark-sql-13-to-14">Upgrading from Spark SQL 1.3 to 1.4</h2>
-
-<h4 id="dataframe-data-readerwriter-interface">DataFrame data reader/writer interface</h4>
-
-<p>Based on user feedback, we created a new, more fluid API for reading data in (<code>SQLContext.read</code>)
-and writing data out (<code>DataFrame.write</code>),
-and deprecated the old APIs (e.g. <code>SQLContext.parquetFile</code>, <code>SQLContext.jsonFile</code>).</p>
-
-<p>See the API docs for <code>SQLContext.read</code> (
- <a href="api/scala/index.html#org.apache.spark.sql.SQLContext@read:DataFrameReader">Scala</a>,
- <a href="api/java/org/apache/spark/sql/SQLContext.html#read()">Java</a>,
- <a href="api/python/pyspark.sql.html#pyspark.sql.SQLContext.read">Python</a>
-) and <code>DataFrame.write</code> (
- <a href="api/scala/index.html#org.apache.spark.sql.DataFrame@write:DataFrameWriter">Scala</a>,
- <a href="api/java/org/apache/spark/sql/DataFrame.html#write()">Java</a>,
- <a href="api/python/pyspark.sql.html#pyspark.sql.DataFrame.write">Python</a>
-) for more information.</p>
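-
-<p>As a brief illustration of the change (the paths below are placeholders):</p>
-
-<pre><code>// 1.3 style, now deprecated:
-val df1 = sqlContext.parquetFile("path/to/input.parquet")
-
-// 1.4+ style, using the reader/writer interface:
-val df2 = sqlContext.read.parquet("path/to/input.parquet")
-df2.write.json("path/to/output")
-</code></pre>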
-
-<h4 id="dataframegroupby-retains-grouping-columns">DataFrame.groupBy retains grouping columns</h4>
-
-<p>Based on user feedback, we changed the default behavior of <code>DataFrame.groupBy().agg()</code> to retain the
-grouping columns in the resulting <code>DataFrame</code>. To keep the behavior in 1.3, set <code>spark.sql.retainGroupColumns</code> to <code>false</code>.</p>
-
-<div class="codetabs">
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// In 1.3.x, in order for the grouping column &quot;department&quot; to show up,</span>
-<span class="c1">// it must be included explicitly as part of the agg function call.</span>
-<span class="n">df</span><span class="o">.</span><span class="n">groupBy</span><span class="o">(</span><span class="s">&quot;department&quot;</span><span class="o">).</span><span class="n">agg</span><span class="o">(</span><span class="n">$</span><span class="s">&quot;department&quot;</span><span class="o">,</span> <span class="n">max</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">),</span> <span class="n">sum</span><span class="o">(</span><span class="s">&quot;expense&quot;</span><span class="o">))</span>
-
-<span class="c1">// In 1.4+, grouping column &quot;department&quot; is included automatically.</span>
-<span class="n">df</span><span class="o">.</span><span class="n">groupBy</span><span class="o">(</span><span class="s">&quot;department&quot;</span><span class="o">).</span><span class="n">agg</span><span class="o">(</span><span class="n">max</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">),</span> <span class="n">sum</span><span class="o">(</span><span class="s">&quot;expense&quot;</span><span class="o">))</span>
-
-<span class="c1">// Revert to 1.3 behavior (not retaining grouping column) by:</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">setConf</span><span class="o">(</span><span class="s">&quot;spark.sql.retainGroupColumns&quot;</span><span class="o">,</span> <span class="s">&quot;false&quot;</span><span class="o">)</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// In 1.3.x, in order for the grouping column &quot;department&quot; to show up,</span>
-<span class="c1">// it must be included explicitly as part of the agg function call.</span>
-<span class="n">df</span><span class="o">.</span><span class="na">groupBy</span><span class="o">(</span><span class="s">&quot;department&quot;</span><span class="o">).</span><span class="na">agg</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">&quot;department&quot;</span><span class="o">),</span> <span class="n">max</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">),</span> <span class="n">sum</span><span class="o">(</span><span class="s">&quot;expense&quot;</span><span class="o">));</span>
-
-<span class="c1">// In 1.4+, grouping column &quot;department&quot; is included automatically.</span>
-<span class="n">df</span><span class="o">.</span><span class="na">groupBy</span><span class="o">(</span><span class="s">&quot;department&quot;</span><span class="o">).</span><span class="na">agg</span><span class="o">(</span><span class="n">max</span><span class="o">(</span><span class="s">&quot;age&quot;</span><span class="o">),</span> <span class="n">sum</span><span class="o">(</span><span class="s">&quot;expense&quot;</span><span class="o">));</span>
-
-<span class="c1">// Revert to 1.3 behavior (not retaining grouping column) by:</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="na">setConf</span><span class="o">(</span><span class="s">&quot;spark.sql.retainGroupColumns&quot;</span><span class="o">,</span> <span class="s">&quot;false&quot;</span><span class="o">);</span></code></pre></div>
-
- </div>
-
-<div data-lang="python">
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">import</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">as</span> <span class="nn">func</span>
-
-<span class="c"># In 1.3.x, in order for the grouping column &quot;department&quot; to show up,</span>
-<span class="c"># it must be included explicitly as part of the agg function call.</span>
-<span class="n">df</span><span class="o">.</span><span class="n">groupBy</span><span class="p">(</span><span class="s">&quot;department&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="s">&quot;department&quot;</span><span class="p">),</span> <span class="n">func</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="s">&quot;age&quot;</span><span class="p">),</span> <span class="n">func</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="s">&quot;expense&quot;</span><span class="p">))</span>
-
-<span class="c"># In 1.4+, grouping column &quot;department&quot; is included automatically.</span>
-<span class="n">df</span><span class="o">.</span><span class="n">groupBy</span><span class="p">(</span><span class="s">&quot;department&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">func</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="s">&quot;age&quot;</span><span class="p">),</span> <span class="n">func</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="s">&quot;expense&quot;</span><span class="p">))</span>
-
-<span class="c"># Revert to 1.3.x behavior (not retaining grouping column) by:</span>
-<span class="n">sqlContext</span><span class="o">.</span><span class="n">setConf</span><span class="p">(</span><span class="s">&quot;spark.sql.retainGroupColumns&quot;</span><span class="p">,</span> <span class="s">&quot;false&quot;</span><span class="p">)</span></code></pre></div>
-
- </div>
-
-</div>
-
-<h2 id="upgrading-from-spark-sql-10-12-to-13">Upgrading from Spark SQL 1.0-1.2 to 1.3</h2>
-
-<p>In Spark 1.3 we removed the &#8220;Alpha&#8221; label from Spark SQL and as part of this did a cleanup of the
-available APIs. From Spark 1.3 onwards, Spark SQL will provide binary compatibility with other
-releases in the 1.X series. This compatibility guarantee excludes APIs that are explicitly marked
-as unstable (i.e., DeveloperAPI or Experimental).</p>
-
-<h4 id="rename-of-schemardd-to-dataframe">Rename of SchemaRDD to DataFrame</h4>
-
-<p>The largest change that users will notice when upgrading to Spark SQL 1.3 is that <code>SchemaRDD</code> has
-been renamed to <code>DataFrame</code>. This is primarily because DataFrames no longer inherit from RDD
-directly, but instead provide most of the functionality that RDDs provide through their own
-implementation. DataFrames can still be converted to RDDs by calling the <code>.rdd</code> method.</p>
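-
-<p>As a rough sketch (assuming an existing <code>SQLContext</code> named <code>sqlContext</code> and a JSON file at an illustrative path),
-the conversion looks like this:</p>
-
-<pre><code>import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.Row
-
-val df = sqlContext.read.json("examples/src/main/resources/people.json")
-
-// DataFrame no longer extends RDD, but .rdd still returns the underlying RDD[Row]
-val rowRdd: RDD[Row] = df.rdd
-</code></pre>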
-
-<p>In Scala there is a type alias from <code>SchemaRDD</code> to <code>DataFrame</code> to provide source compatibility for
-some use cases. It is still recommended that users update their code to use <code>DataFrame</code> instead.
-Java and Python users will need to update their code.</p>
-
-<h4 id="unification-of-the-java-and-scala-apis">Unification of the Java and Scala APIs</h4>
-
-<p>Prior to Spark 1.3 there were separate Java compatible classes (<code>JavaSQLContext</code> and <code>JavaSchemaRDD</code>)
-that mirrored the Scala API. In Spark 1.3 the Java API and Scala API have been unified. Users
-of either language should use <code>SQLContext</code> and <code>DataFrame</code>. In general, these classes try to
-use types that are usable from both languages (i.e. <code>Array</code> instead of language specific collections).
-In some cases where no common type exists (e.g., for passing in closures or Maps) function overloading
-is used instead.</p>
-
-<p>Additionally, the Java-specific types API has been removed. Users of both Scala and Java should
-use the classes present in <code>org.apache.spark.sql.types</code> to describe schema programmatically.</p>
-
-<h4 id="isolation-of-implicit-conversions-and-removal-of-dsl-package-scala-only">Isolation of Implicit Conversions and Removal of dsl Package (Scala-only)</h4>
-
-<p>Many of the code examples prior to Spark 1.3 started with <code>import sqlContext._</code>, which brought
-all of the functions from sqlContext into scope. In Spark 1.3 we have isolated the implicit
-conversions for converting <code>RDD</code>s into <code>DataFrame</code>s into an object inside of the <code>SQLContext</code>.
-Users should now write <code>import sqlContext.implicits._</code>.</p>
-
-<p>Additionally, the implicit conversions now only augment RDDs that are composed of <code>Product</code>s (i.e.,
-case classes or tuples) with a method <code>toDF</code>, instead of applying automatically.</p>
-
-<p>When using functions inside of the DSL (now replaced with the <code>DataFrame</code> API), users used to import
-<code>org.apache.spark.sql.catalyst.dsl</code>. Instead, the public DataFrame functions API should be used:
-<code>import org.apache.spark.sql.functions._</code>.</p>
-
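-<p>A minimal sketch of the new imports in action is shown below (it assumes an existing <code>SparkContext</code> named <code>sc</code> and
-a <code>SQLContext</code> named <code>sqlContext</code>; the <code>Person</code> case class and the sample rows are illustrative only):</p>
-
-<pre><code>import sqlContext.implicits._            // implicit conversions, including toDF on RDDs of Products
-import org.apache.spark.sql.functions._  // public DataFrame functions, replacing catalyst.dsl
-
-case class Person(name: String, age: Int)
-
-// toDF is only available on RDDs of Products (case classes or tuples)
-val people = sc.parallelize(Seq(Person("Alice", 30), Person("Bob", 25))).toDF()
-people.select(upper($"name"), $"age" + 1).show()
-</code></pre>
-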
-<h4 id="removal-of-the-type-aliases-in-orgapachesparksql-for-datatype-scala-only">Removal of the type aliases in org.apache.spark.sql for DataType (Scala-only)</h4>
-
-<p>Spark 1.3 removes the type aliases that were present in the base sql package for <code>DataType</code>. Users
-should instead import the classes in <code>org.apache.spark.sql.types</code>.</p>
-
-<h4 id="udf-registration-moved-to-sqlcontextudf-java--scala">UDF Registration Moved to <code>sqlContext.udf</code> (Java &amp; Scala)</h4>
-
-<p>Functions that are used to register UDFs, either for use in the DataFrame DSL or SQL, have been
-moved into the udf object in <code>SQLContext</code>.</p>
-
-<div class="codetabs">
-<div data-lang="scala">
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="n">sqlContext</span><span class="o">.</span><span class="n">udf</span><span class="o">.</span><span class="n">register</span><span class="o">(</span><span class="s">&quot;strLen&quot;</span><span class="o">,</span> <span class="o">(</span><span class="n">s</span><span class="k">:</span> <span class="kt">String</span><span class="o">)</span> <span class="k">=&gt;</span> <span class="n">s</span><span class="o">.</span><span class="n">length</span><span class="o">())</span></code></pre></div>
-
- </div>
-
-<div data-lang="java">
-
- <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">sqlContext</span><span class="o">.</span><span class="na">udf</span><span class="o">().</span><span class="na">register</span><span class="o">(</span><span class="s">&quot;strLen&quot;</span><span class="o">,</span> <span class="o">(</span><span class="n">String</span> <span class="n">s</span><span class="o">)</span> <span class="o">-&gt;</span> <span class="n">s</span><span class="o">.</span><span class="na">length</span><span class="o">(),</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">);</span></code></pre></div>
-
- </div>
-
-</div>
-
-<p>Python UDF registration is unchanged.</p>
-
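-<p>Once registered, the UDF can be invoked from SQL as before. A small Scala sketch (it assumes a DataFrame has already been
-registered as a temporary table named <code>people</code> with a string column <code>name</code>; those names are illustrative only):</p>
-
-<pre><code>// e.g. df.registerTempTable("people") was called earlier
-val lengths = sqlContext.sql("SELECT name, strLen(name) AS name_len FROM people")
-lengths.show()
-</code></pre>
-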
-<h4 id="python-datatypes-no-longer-singletons">Python DataTypes No Longer Singletons</h4>
-
-<p>When using DataTypes in Python you will need to construct them (i.e. <code>StringType()</code>) instead of
-referencing a singleton.</p>
-
-<h2 id="migration-guide-for-shark-users">Migration Guide for Shark Users</h2>
-
-<h3 id="scheduling">Scheduling</h3>
-<p>To set a <a href="job-scheduling.html#fair-scheduler-pools">Fair Scheduler</a> pool for a JDBC client session,
-users can set the <code>spark.sql.thriftserver.scheduler.pool</code> variable:</p>
-
-<pre><code>SET spark.sql.thriftserver.scheduler.pool=accounting;
-</code></pre>
-
-<h3 id="reducer-number">Reducer number</h3>
-
-<p>In Shark, the default reducer number is 1 and is controlled by the property <code>mapred.reduce.tasks</code>. Spark
-SQL deprecates this property in favor of <code>spark.sql.shuffle.partitions</code>, whose default value
-is 200. Users may customize this property via <code>SET</code>:</p>
-
-<pre><code>SET spark.sql.shuffle.partitions=10;
-SELECT page, count(*) c
-FROM logs_last_month_cached
-GROUP BY page ORDER BY c DESC LIMIT 10;
-</code></pre>
-
-<p>You may also put this property in <code>hive-site.xml</code> to override the default value.</p>
-
-<p>For now, the <code>mapred.reduce.tasks</code> property is still recognized, and is converted to
-<code>spark.sql.shuffle.partitions</code> automatically.</p>
-
-<h3 id="caching">Caching</h3>
-
-<p>The <code>shark.cache</code> table property no longer exists, and tables whose name end with <code>_cached</code> are no
-longer automatically cached. Instead, we provide <code>CACHE TABLE</code> and <code>UNCACHE TABLE</code> statements to
-let users control table caching explicitly:</p>
-
-<pre><code>CACHE TABLE logs_last_month;
-UNCACHE TABLE logs_last_month;
-</code></pre>
-
-<p><strong>NOTE:</strong> <code>CACHE TABLE tbl</code> is now <strong>eager</strong> by default, not <strong>lazy</strong>. You no longer need to trigger cache materialization manually.</p>
-
-<p>Since Spark 1.2.0, Spark SQL provides a statement that lets users control whether a table is cached lazily:</p>
-
-<pre><code>CACHE [LAZY] TABLE [AS SELECT] ...
-</code></pre>
-
-<p>Several caching related features are not supported yet:</p>
-
-<ul>
- <li>User defined partition level cache eviction policy</li>
- <li>RDD reloading</li>
- <li>In-memory cache write through policy</li>
-</ul>
-
-<h2 id="compatibility-with-apache-hive">Compatibility with Apache Hive</h2>
-
-<p>Spark SQL is designed to be compatible with the Hive Metastore, SerDes and UDFs. Currently Spark
-SQL is based on Hive 0.12.0 and 0.13.1.</p>
-
-<h4 id="deploying-in-existing-hive-warehouses">Deploying in Existing Hive Warehouses</h4>
-
-<p>The Spark SQL Thrift JDBC server is designed to be &#8220;out of the box&#8221; compatible with existing Hive
-installations. You do not need to modify your existing Hive Metastore or change the data placement
-or partitioning of your tables.</p>
-
-<h3 id="supported-hive-features">Supported Hive Features</h3>
-
-<p>Spark SQL supports the vast majority of Hive features, such as:</p>
-
-<ul>
- <li>Hive query statements, including:
- <ul>
- <li><code>SELECT</code></li>
- <li><code>GROUP BY</code></li>
- <li><code>ORDER BY</code></li>
- <li><code>CLUSTER BY</code></li>
- <li><code>SORT BY</code></li>
- </ul>
- </li>
- <li>All Hive operators, including:
- <ul>
-      <li>Relational operators (<code>=</code>, <code>&lt;=&gt;</code>, <code>==</code>, <code>&lt;&gt;</code>, <code>&lt;</code>, <code>&gt;</code>, <code>&gt;=</code>, <code>&lt;=</code>, etc)</li>
- <li>Arithmetic operators (<code>+</code>, <code>-</code>, <code>*</code>, <code>/</code>, <code>%</code>, etc)</li>
- <li>Logical operators (<code>AND</code>, <code>&amp;&amp;</code>, <code>OR</code>, <code>||</code>, etc)</li>
- <li>Complex type constructors</li>
- <li>Mathematical functions (<code>sign</code>, <code>ln</code>, <code>cos</code>, etc)</li>
- <li>String functions (<code>instr</code>, <code>length</code>, <code>printf</code>, etc)</li>
- </ul>
- </li>
- <li>User defined functions (UDF)</li>
- <li>User defined aggregation functions (UDAF)</li>
- <li>User defined serialization formats (SerDes)</li>
- <li>Window functions</li>
- <li>Joins
- <ul>
- <li><code>JOIN</code></li>
- <li><code>{LEFT|RIGHT|FULL} OUTER JOIN</code></li>
- <li><code>LEFT SEMI JOIN</code></li>
- <li><code>CROSS JOIN</code></li>
- </ul>
- </li>
- <li>Unions</li>
- <li>Sub-queries
- <ul>
- <li><code>SELECT col FROM ( SELECT a + b AS col from t1) t2</code></li>
- </ul>
- </li>
- <li>Sampling</li>
- <li>Explain</li>
- <li>Partitioned tables including dynamic partition insertion</li>
- <li>View</li>
- <li>All Hive DDL Functions, including:
- <ul>
- <li><code>CREATE TABLE</code></li>
- <li><code>CREATE TABLE AS SELECT</code></li>
- <li><code>ALTER TABLE</code></li>
- </ul>
- </li>
- <li>Most Hive Data types, including:
- <ul>
- <li><code>TINYINT</code></li>
- <li><code>SMALLINT</code></li>
- <li><code>INT</code></li>
- <li><code>BIGINT</code></li>
- <li><code>BOOLEAN</code></li>
- <li><code>FLOAT</code></li>
- <li><code>DOUBLE</code></li>
- <li><code>STRING</code></li>
- <li><code>BINARY</code></li>
- <li><code>TIMESTAMP</code></li>
- <li><code>DATE</code></li>
- <li><code>ARRAY&lt;&gt;</code></li>
- <li><code>MAP&lt;&gt;</code></li>
- <li><code>STRUCT&lt;&gt;</code></li>
- </ul>
- </li>
-</ul>
-
-<h3 id="unsupported-hive-functionality">Unsupported Hive Functionality</h3>
-
-<p>Below is a list of Hive features that we don&#8217;t support yet. Most of these features are rarely used
-in Hive deployments.</p>
-
-<p><strong>Major Hive Features</strong></p>
-
-<ul>
-  <li>Tables with buckets: bucketing is hash partitioning within a Hive table partition. Spark SQL
-doesn&#8217;t support buckets yet.</li>
-</ul>
-
-<p><strong>Esoteric Hive Features</strong></p>
-
-<ul>
- <li><code>UNION</code> type</li>
- <li>Unique join</li>
-  <li>Column statistics collecting: Spark SQL does not piggyback scans to collect column statistics at
-the moment and only supports populating the sizeInBytes field of the Hive metastore.</li>
-</ul>
-
-<p><strong>Hive Input/Output Formats</strong></p>
-
-<ul>
-  <li>File format for CLI: for results displayed back to the CLI, Spark SQL only supports TextOutputFormat.</li>
- <li>Hadoop archive</li>
-</ul>
-
-<p><strong>Hive Optimizations</strong></p>
-
-<p>A handful of Hive optimizations are not yet included in Spark. Some of these (such as indexes) are
-less important due to Spark SQL&#8217;s in-memory computational model. Others are slotted for future
-releases of Spark SQL.</p>
-
-<ul>
- <li>Block level bitmap indexes and virtual columns (used to build indexes)</li>
- <li>Automatically determine the number of reducers for joins and groupbys: Currently in Spark SQL, you
-need to control the degree of parallelism post-shuffle using &#8220;<code>SET spark.sql.shuffle.partitions=[num_tasks];</code>&#8221;.</li>
- <li>Meta-data only query: For queries that can be answered by using only meta data, Spark SQL still
-launches tasks to compute the result.</li>
- <li>Skew data flag: Spark SQL does not follow the skew data flags in Hive.</li>
- <li><code>STREAMTABLE</code> hint in join: Spark SQL does not follow the <code>STREAMTABLE</code> hint.</li>
- <li>Merge multiple small files for query results: if the result output contains multiple small files,
-Hive can optionally merge the small files into fewer large files to avoid overflowing the HDFS
-metadata. Spark SQL does not support that.</li>
-</ul>
-
-<h1 id="reference">Reference</h1>
-
-<h2 id="data-types">Data Types</h2>
-
-<p>Spark SQL and DataFrames support the following data types:</p>
-
-<ul>
- <li>Numeric types
- <ul>
- <li><code>ByteType</code>: Represents 1-byte signed integer numbers.
- The range of numbers is from <code>-128</code> to <code>127</code>.</li>
- <li><code>ShortType</code>: Represents 2-byte signed integer numbers.
- The range of numbers is from <code>-32768</code> to <code>32767</code>.</li>
- <li><code>IntegerType</code>: Represents 4-byte signed integer numbers.
- The range of numbers is from <code>-2147483648</code> to <code>2147483647</code>.</li>
- <li><code>LongType</code>: Represents 8-byte signed integer numbers.
- The range of numbers is from <code>-9223372036854775808</code> to <code>9223372036854775807</code>.</li>
- <li><code>FloatType</code>: Represents 4-byte single-precision floating point numbers.</li>
- <li><code>DoubleType</code>: Represents 8-byte double-precision floating point numbers.</li>
- <li><code>DecimalType</code>: Represents arbitrary-precision signed decimal numbers. Backed internally by <code>java.math.BigDecimal</code>. A <code>BigDecimal</code> consists of an arbitrary precision integer unscaled value and a 32-bit integer scale.</li>
- </ul>
- </li>
- <li>String type
- <ul>
- <li><code>StringType</code>: Represents character string values.</li>
- </ul>
- </li>
- <li>Binary type
- <ul>
- <li><code>BinaryType</code>: Represents byte sequence values.</li>
- </ul>
- </li>
- <li>Boolean type
- <ul>
- <li><code>BooleanType</code>: Represents boolean values.</li>
- </ul>
- </li>
- <li>Datetime type
- <ul>
-    <li><code>TimestampType</code>: Represents values comprising the fields year, month, day,
-    hour, minute, and second.</li>
-    <li><code>DateType</code>: Represents values comprising the fields year, month, and day.</li>
- </ul>
- </li>
- <li>Complex types
- <ul>
-    <li><code>ArrayType(elementType, containsNull)</code>: Represents values comprising a sequence of
-    elements with the type of <code>elementType</code>. <code>containsNull</code> is used to indicate if
-    elements in an <code>ArrayType</code> value can have <code>null</code> values.</li>
-    <li><code>MapType(keyType, valueType, valueContainsNull)</code>:
-    Represents values comprising a set of key-value pairs. The data type of keys is
-    described by <code>keyType</code> and the data type of values is described by <code>valueType</code>.
-    For a <code>MapType</code> value, keys are not allowed to have <code>null</code> values. <code>valueContainsNull</code>
-    is used to indicate if values of a <code>MapType</code> value can have <code>null</code> values.</li>
- <li><code>StructType(fields)</code>: Represents values with the structure described by
- a sequence of <code>StructField</code>s (<code>fields</code>).
- <ul>
-        <li><code>StructField(name, dataType, nullable)</code>: Represents a field in a <code>StructType</code>.
-        The name of a field is indicated by <code>name</code>. The data type of a field is indicated
-        by <code>dataType</code>. <code>nullable</code> is used to indicate if values of this field can have
-        <code>null</code> values. A schema construction sketch follows this list.</li>
- </ul>
- </li>
- </ul>
- </li>
-</ul>
-
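-<p>For example, a schema using several of the complex types above can be built programmatically. A minimal Scala sketch
-(the field names are illustrative only):</p>
-
-<pre><code>import org.apache.spark.sql.types._
-
-val schema = StructType(Seq(
-  StructField("name", StringType, nullable = false),
-  StructField("scores", ArrayType(DoubleType, containsNull = false), nullable = true),
-  StructField("properties", MapType(StringType, StringType, valueContainsNull = true), nullable = true)))
-</code></pre>
-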
-<div class="codetabs">
-<div data-lang="scala">
-
- <p>All data types of Spark SQL are located in the package <code>org.apache.spark.sql.types</code>.
-You can access them by doing</p>
-
- <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.sql.types._</span></code></pre></div>
-
- <table class="table">
-<tr>
- <th style="width:20%">Data type</th>
- <th style="width:40%">Value type in Scala</th>
- <th>API to access or create a data type</th></tr>
-<tr>
- <td> <b>ByteType</b> </td>
- <td> Byte </td>
- <td>
- ByteType
- </td>
-</tr>
-<tr>
- <td> <b>ShortType</b> </td>
- <td> Short </td>
- <td>
- ShortType
- </td>
-</tr>
-<tr>
- <td> <b>IntegerType</b> </td>
- <td> Int </td>
- <td>
- IntegerType
- </td>
-</tr>
-<tr>
- <td> <b>LongType</b> </td>
- <td> Long </td>
- <td>
- LongType
- </td>
-</tr>
-<tr>
- <td> <b>FloatType</b> </td>
- <td> Float </td>
- <td>
- FloatType
- </td>
-</tr>
-<tr>
- <td> <b>DoubleType</b> </td>
- <td> Double </td>
- <td>
- DoubleType
- </td>
-</tr>
-<tr>
- <td> <b>DecimalType</b> </td>
- <td> java.math.BigDecimal </td>
- <td>
- DecimalType
- </td>
-</tr>
-<tr>
- <td> <b>StringType</b> </td>
- <td> String </td>
- <td>
- StringType
- </td>
-</tr>
-<tr>
- <td> <b>BinaryType</b> </td>
- <td> Array[Byte] </td>
- <td>
- BinaryType
- </td>
-</tr>
-<tr>
- <td> <b>BooleanType</b> </td>
- <td> Boolean </td>
- <td>
- BooleanType
- </td>
-</tr>
-<tr>
- <td> <b>TimestampType</b> </td>
- <td> java.sql.Timestamp </td>
- <td>
- TimestampType
- </td>
-</tr>
-<tr>
- <td> <b>DateType</b> </td>
- <td> java.sql.Date </td>
- <td>
- DateType
- </td>
-</tr>
-<tr>
- <td> <b>ArrayType</b> </td>
- <td> scala.collection.Seq </td>
- <td>
- ArrayType(<i>elementType</i>, [<i>containsNull</i>])<br />
- <b>Note:</b> The default value of <i>containsNull</i> is <i>true</i>.
- </td>
-</tr>
-<tr>
- <td> <b>MapType</b> </td>
- <td> scala.collection.Map </td>
- <td>
- MapType(<i>keyType</i>, <i>valueType</i>, [<i>valueContainsNull</i>])<br />
- <b>Note:</b> The default value of <i>valueContainsNull</i> is <i>true</i>.
- </td>
-</tr>
-<tr>
- <td> <b>StructType</b> </td>
- <td> org.apache.spark.sql.Row </td>
- <td>
- StructType(<i>fields</i>)<br />
- <b>Note:</b> <i>fields</i> is a Seq of StructFields. Also, two fields with the same
- name are not allowed.
- </td>
-</tr>
-<tr>
- <td> <b>StructField</b> </td>
- <td> The value type in Scala of the data type of this field
- (For example, Int for a StructField with the data type IntegerType) </td>
- <td>
- StructField(<i>name</i>, <i>dataType</i>, <i>nullable</i>)
- </td>
-</tr>
-</table>
-
- </div>
-
-<div data-lang="java">
-
-  <p>All data types of Spark SQL are located in the package
-<code>org.apache.spark.sql.types</code>. To access or create a data type,
-please use factory methods provided in
-<code>org.apache.spark.sql.types.DataTypes</code>.</p>
-
- <table class="table">
-<tr>
- <th style="width:20%">Data type</th>
- <th style="width:40%">Value type in Java</th>
- <th>API to access or create a data type</th></tr>
-<tr>
- <td> <b>ByteType</b> </td>
- <td> byte or Byte </td>
- <td>
- DataTypes.ByteType
- </td>
-</tr>
-<tr>
- <td> <b>ShortType</b> </td>
- <td> short or Short </td>
- <td>
- DataTypes.ShortType
- </td>
-</tr>
-<tr>
- <td> <b>IntegerType</b> </td>
- <td> int or Integer </td>
- <td>
- DataTypes.IntegerType
- </td>
-</tr>
-<tr>
- <td> <b>LongType</b> </td>
- <td> long or Long </td>
- <td>
- DataTypes.LongType
- </td>
-</tr>
-<tr>
- <td> <b>FloatType</b> </td>
- <td> float or Float </td>
- <td>
- DataTypes.FloatType
- </td>
-</tr>
-<tr>
- <td> <b>DoubleType</b> </td>
- <td> double or Double </td>
- <td>
- DataTypes.DoubleType
- </td>
-</tr>
-<tr>
- <td> <b>DecimalType</b> </td>
- <td> java.math.BigDecimal </td>
- <td>
- DataTypes.createDecimalType()<br />
- DataTypes.createDecimalType(<i>precision</i>, <i>scale</i>).
- </td>
-</tr>
-<tr>
- <td> <b>StringType</b> </td>
- <td> String </td>
- <td>
- DataTypes.StringType
- </td>
-</tr>
-<tr>
- <td> <b>BinaryType</b> </td>
- <td> byte[] </td>
- <td>
- DataTypes.BinaryType
- </td>
-</tr>
-<tr>
- <td> <b>BooleanType</b> </td>
- <td> boolean or Boolean </td>
- <td>
- DataTypes.BooleanType
- </td>
-</tr>
-<tr>
- <td> <b>TimestampType</b> </td>
- <td> java.sql.Timestamp </td>
- <td>
- DataTypes.TimestampType
- </td>
-</tr>
-<tr>
- <td> <b>DateType</b> </td>
- <td> java.sql.Date </td>
- <td>
- DataTypes.DateType
- </td>
-</tr>
-<tr>
- <td> <b>ArrayType</b> </td>
- <td> java.util.List </td>
- <td>
- DataTypes.createArrayType(<i>elementType</i>)<br />
- <b>Note:</b> The value of <i>containsNull</i> will be <i>true</i><br />
- DataTypes.createArrayType(<i>elementType</i>, <i>containsNull</i>).
- </td>
-</tr>
-<tr>
- <td> <b>MapType</b> </td>
- <td> java.util.Map </td>
- <td>
- DataTypes.createMapType(<i>keyType</i>, <i>valueType</i>)<br />
- <b>Note:</b> The value of <i>valueContainsNull</i> will be <i>true</i>.<br />
- DataTypes.createMapType(<i>keyType</i>, <i>valueType</i>, <i>valueContainsNull</i>)<br />
- </td>
-</tr>
-<tr>
- <td> <b>StructType</b> </td>
- <td> org.apache.spark.sql.Row </td>
- <td>
- DataTypes.createStructType(<i>fields</i>)<br />
- <b>Note:</b> <i>fields</i> is a List or an array of StructFields.
- Also, two fields with the same name are not allowed.
- </td>
-</tr>
-<tr>
- <td> <b>StructField</b> </td>
- <td> The value type in Java of the data type of this field
- (For example, int for a StructField with the data type IntegerType) </td>
- <td>
- DataTypes.createStructField(<i>name</i>, <i>dataType</i>, <i>nullable</i>)
- </td>
-</tr>
-</table>
-
- </div>
-
-<div data-lang="python">
-
-  <p>All data types of Spark SQL are located in the package <code>pyspark.sql.types</code>.
-You can access them by doing</p>
-
- <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="o">*</span></code></pre></div>
-
- <table class="table">
-<tr>
- <th style="width:20%">Data type</th>
- <th style="width:40%">Value type in Python</th>
- <th>API to access or create a data type</th></tr>
-<tr>
- <td> <b>ByteType</b> </td>
- <td>
- int or long <br />
- <b>Note:</b> Numbers will be converted to 1-byte signed integer numbers at runtime.
- Please make sure that numbers are within the range of -128 to 127.
- </td>
- <td>
- ByteType()
- </td>
-</tr>
-<tr>
- <td> <b>ShortType</b> </td>
- <td>
- int or long <br />
- <b>Note:</b> Numbers will be converted to 2-byte signed integer numbers at runtime.
- Please make sure that numbers are within the range of -32768 to 32767.
- </td>
- <td>
- ShortType()
- </td>
-</tr>
-<tr>
- <td> <b>IntegerType</b> </td>
- <td> int or long </td>
- <td>
- IntegerType()
- </td>
-</tr>
-<tr>
- <td> <b>LongType</b> </td>
- <td>
- long <br />
- <b>Note:</b> Numbers will be converted to 8-byte signed integer numbers at runtime.
- Please make sure that numbers are within the range of
- -9223372036854775808 to 9223372036854775807.
- Otherwise, please convert data to decimal.Decimal and use DecimalType.
- </td>
- <td>
- LongType()
- </td>
-</tr>
-<tr>
- <td> <b>FloatType</b> </td>
- <td>
- float <br />
- <b>Note:</b> Numbers will be converted to 4-byte single-precision floating
- point numbers at runtime.
- </td>
- <td>
- FloatType()
- </td>
-</tr>
-<tr>
- <td> <b>DoubleType</b> </td>
- <td> float </td>
- <td>
- DoubleType()
- </td>
-</tr>
-<tr>
- <td> <b>DecimalType</b> </td>
- <td> decimal.Decimal </td>
- <td>
- DecimalType()
- </td>
-</tr>
-<tr>
- <td> <b>StringType</b> </td>
- <td> string </td>
- <td>
- StringType()
- </td>
-</tr>
-<tr>
- <td> <b>BinaryType</b> </td>
- <td> bytearray </td>
- <td>
- BinaryType()
- </td>
-</tr>
-<tr>
- <td> <b>BooleanType</b> </td>
- <td> bool </td>
- <td>
- BooleanType()
- </td>
-</tr>
-<tr>
- <td> <b>TimestampType</b> </td>
- <td> datetime.datetime </td>
- <td>
- TimestampType()
- </td>
-</tr>
-<tr>
- <td> <b>DateType</b> </td>
- <td> datetime.date </td>
- <td>
- DateType()
- </td>
-</tr>
-<tr>
- <td> <b>ArrayType</b> </td>
- <td> list, tuple, or array </td>
- <td>
- ArrayType(<i>elementType</i>, [<i>containsNull</i>])<br />
- <b>Note:</b> The default value of <i>containsNull</i> is <i>True</i>.
- </td>
-</tr>
-<tr>
- <td> <b>MapType</b> </td>
- <td> dict </td>
- <td>
- MapType(<i>keyType</i>, <i>valueType</i>, [<i>valueContainsNull</i>])<br />
- <b>Note:</b> The default value of <i>valueContainsNull</i> is <i>True</i>.
- </td>
-</tr>
-<tr>
- <td> <b>StructType</b> </td>
- <td> list or tuple </td>
- <td>
- StructType(<i>fields</i>)<br />
-    <b>Note:</b> <i>fields</i> is a list of StructFields. Also, two fields with the same
- name are not allowed.
- </td>
-</tr>
-<tr>
- <td> <b>StructField</b> </td>
- <td> The value type in Python of the data type of this field
-  (For example, int for a StructField with the data type IntegerType) </td>
- <td>
- StructField(<i>name</i>, <i>dataType</i>, <i>nullable</i>)
- </td>
-</tr>
-</table>
-
- </div>
-
-<div data-lang="r">
-
- <table class="table">
-<tr>
- <th style="width:20%">Data type</th>
- <th style="width:40%">Value type in R</th>
- <th>API to access or create a data type</th></tr>
-<tr>
- <td> <b>ByteType</b> </td>
- <td>
- integer <br />
- <b>Note:</b> Numbers will be converted to 1-byte signed integer numbers at runtime.
- Please make sure that numbers are within the range of -128 to 127.
- </td>
- <td>
- "byte"
- </td>
-</tr>
-<tr>
- <td> <b>ShortType</b> </td>
- <td>
- integer <br />
- <b>Note:</b> Numbers will be converted to 2-byte signed integer numbers at runtime.
- Please make sure that numbers are within the range of -32768 to 32767.
- </td>
- <td>
- "short"
- </td>
-</tr>
-<tr>
- <td> <b>IntegerType</b> </td>
- <td> integer </td>
- <td>
- "integer"
- </td>
-</tr>
-<tr>
- <td> <b>LongType</b> </td>
- <td>
- integer <br />
-    <b>Note:</b> Numbers will be converted to 8-byte signed integer numbers at runtime.
-    Please make sure that numbers are within the range of
-    -9223372036854775808 to 9223372036854775807.
- </td>
- <td>
- "long"
- </td>
-</tr>
-<tr>
- <td> <b>FloatType</b> </td>
- <td>
- numeric <br />
- <b>Note:</b> Numbers will be converted to 4-byte single-precision floating
- point numbers at runtime.
- </td>
- <td>
- "float"
- </td>
-</tr>
-<tr>
- <td> <b>DoubleType</b> </td>
- <td> numeric </td>
- <td>
- "double"
- </td>
-</tr>
-<tr>
- <td> <b>DecimalType</b> </td>
- <td> Not supported </td>
- <td>
- Not supported
- </td>
-</tr>
-<tr>
- <td> <b>StringType</b> </td>
- <td> character </td>
- <td>
- "string"
- </td>
-</tr>
-<tr>
- <td> <b>BinaryType</b> </td>
- <td> raw </td>
- <td>
- "binary"
- </td>
-</tr>
-<tr>
- <td> <b>BooleanType</b> </td>
- <td> logical </td>
- <td>
- "bool"
- </td>
-</tr>
-<tr>
- <td> <b>TimestampType</b> </td>
- <td> POSIXct </td>
- <td>
- "timestamp"
- </td>
-</tr>
-<tr>
- <td> <b>DateType</b> </td>
- <td> Date </td>
- <td>
- "date"
- </td>
-</tr>
-<tr>
- <td> <b>ArrayType</b> </td>
- <td> vector or list </td>
- <td>
- list(type="array", elementType=<i>elementType</i>, containsNull=[<i>containsNull</i>])<br />
- <b>Note:</b> The default value of <i>containsNull</i> is <i>True</i>.
- </td>
-</tr>
-<tr>
- <td> <b>MapType</b> </td>
- <td> environment </td>
- <td>
- list(type="map", keyType=<i>keyType</i>, valueType=<i>valueType</i>, valueContainsNull=[<i>valueContainsNull</i>])<br />
- <b>Note:</b> The default value of <i>valueContainsNull</i> is <i>True</i>.
- </td>
-</tr>
-<tr>
- <td> <b>StructType</b> </td>
- <td> named list</td>
- <td>
- list(type="struct", fields=<i>fields</i>)<br />
-    <b>Note:</b> <i>fields</i> is a list of StructFields. Also, two fields with the same
- name are not allowed.
- </td>
-</tr>
-<tr>
- <td> <b>StructField</b> </td>
- <td> The value type in R of the data type of this field
- (For example, integer for a StructField with the data type IntegerType) </td>
- <td>
- list(name=<i>name</i>, type=<i>dataType</i>, nullable=<i>nullable</i>)
- </td>
-</tr>
-</table>
-
- </div>
-
-</div>
-
-<h2 id="nan-semantics">NaN Semantics</h2>
-
-<p>There is special handling for not-a-number (NaN) values when dealing with <code>float</code> or <code>double</code> types that
-does not exactly match standard floating point semantics.
-Specifically (a small sketch follows this list):</p>
-
-<ul>
- <li>NaN = NaN returns true.</li>
- <li>In aggregations all NaN values are grouped together.</li>
- <li>NaN is treated as a normal value in join keys.</li>
-  <li>NaN values sort last in ascending order, treated as larger than any other numeric value.</li>
-</ul>
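-
-<p>For instance, the grouping and equality behaviour can be seen with a small Scala sketch like the one below (it assumes an
-existing <code>SparkContext</code> named <code>sc</code>, a <code>SQLContext</code> named <code>sqlContext</code>, and illustrative column names):</p>
-
-<pre><code>import sqlContext.implicits._
-
-val df = sc.parallelize(Seq(("a", Double.NaN), ("b", 1.0), ("c", Double.NaN))).toDF("id", "x")
-
-// All NaN values fall into a single group, so grouping on "x" yields two keys:
-// NaN (count 2) and 1.0 (count 1)
-df.groupBy("x").count().show()
-</code></pre>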
-
-
- </div> <!-- /container -->
-
- <script src="js/vendor/jquery-1.8.0.min.js"></script>
- <script src="js/vendor/bootstrap.min.js"></script>
- <script src="js/vendor/anchor.min.js"></script>
- <script src="js/main.js"></script>
-
- <!-- MathJax Section -->
- <script type="text/x-mathjax-config">
- MathJax.Hub.Config({
- TeX: { equationNumbers: { autoNumber: "AMS" } }
- });
- </script>
- <script>
- // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
- // We could use "//cdn.mathjax...", but that won't support "file://".
- (function(d, script) {
- script = d.createElement('script');
- script.type = 'text/javascript';
- script.async = true;
- script.onload = function(){
- MathJax.Hub.Config({
- tex2jax: {
- inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
- displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
- processEscapes: true,
- skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
- }
- });
- };
- script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
- 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
- d.getElementsByTagName('head')[0].appendChild(script);
- }(document));
- </script>
- </body>
-</html>