<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
        <title>SparkR (R on Spark) - Spark 1.4.1 Documentation</title>
        

        

        <link rel="stylesheet" href="css/bootstrap.min.css">
        <style>
            body {
                padding-top: 60px;
                padding-bottom: 40px;
            }
        </style>
        <meta name="viewport" content="width=device-width">
        <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
        <link rel="stylesheet" href="css/main.css">

        <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>

        <link rel="stylesheet" href="css/pygments-default.css">

        
        <!-- Google analytics script -->
        <script type="text/javascript">
          var _gaq = _gaq || [];
          _gaq.push(['_setAccount', 'UA-32518208-2']);
          _gaq.push(['_trackPageview']);

          (function() {
            var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
            ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
            var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
          })();
        </script>
        

    </head>
    <body>
        <!--[if lt IE 7]>
            <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
        <![endif]-->

        <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->

        <div class="navbar navbar-fixed-top" id="topbar">
            <div class="navbar-inner">
                <div class="container">
                    <div class="brand"><a href="index.html">
                      <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">1.4.1</span>
                    </div>
                    <ul class="nav">
                        <!--TODO(andyk): Add class="active" attribute to li some how.-->
                        <li><a href="index.html">Overview</a></li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="quick-start.html">Quick Start</a></li>
                                <li><a href="programming-guide.html">Spark Programming Guide</a></li>
                                <li class="divider"></li>
                                <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
                                <li><a href="sql-programming-guide.html">DataFrames and SQL</a></li>
                                <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
                                <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
                                <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
                                <li><a href="sparkr.html">SparkR (R on Spark)</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li>
                                <li><a href="api/java/index.html">Java</a></li>
                                <li><a href="api/python/index.html">Python</a></li>
                                <li><a href="api/R/index.html">R</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="cluster-overview.html">Overview</a></li>
                                <li><a href="submitting-applications.html">Submitting Applications</a></li>
                                <li class="divider"></li>
                                <li><a href="spark-standalone.html">Spark Standalone</a></li>
                                <li><a href="running-on-mesos.html">Mesos</a></li>
                                <li><a href="running-on-yarn.html">YARN</a></li>
                                <li class="divider"></li>
                                <li><a href="ec2-scripts.html">Amazon EC2</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="configuration.html">Configuration</a></li>
                                <li><a href="monitoring.html">Monitoring</a></li>
                                <li><a href="tuning.html">Tuning Guide</a></li>
                                <li><a href="job-scheduling.html">Job Scheduling</a></li>
                                <li><a href="security.html">Security</a></li>
                                <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
                                <li><a href="hadoop-third-party-distributions.html">3<sup>rd</sup>-Party Hadoop Distros</a></li>
                                <li class="divider"></li>
                                <li><a href="building-spark.html">Building Spark</a></li>
                                <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
                                <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects">Supplemental Projects</a></li>
                            </ul>
                        </li>
                    </ul>
                    <!--<p class="navbar-text pull-right"><span class="version-text">v1.4.1</span></p>-->
                </div>
            </div>
        </div>

        <div class="container" id="content">
          
            <h1 class="title">SparkR (R on Spark)</h1>
          

          <ul id="markdown-toc">
  <li><a href="#overview" id="markdown-toc-overview">Overview</a></li>
  <li><a href="#sparkr-dataframes" id="markdown-toc-sparkr-dataframes">SparkR DataFrames</a>    <ul>
      <li><a href="#starting-up-sparkcontext-sqlcontext" id="markdown-toc-starting-up-sparkcontext-sqlcontext">Starting Up: SparkContext, SQLContext</a></li>
      <li><a href="#creating-dataframes" id="markdown-toc-creating-dataframes">Creating DataFrames</a>        <ul>
          <li><a href="#from-local-data-frames" id="markdown-toc-from-local-data-frames">From local data frames</a></li>
          <li><a href="#from-data-sources" id="markdown-toc-from-data-sources">From Data Sources</a></li>
          <li><a href="#from-hive-tables" id="markdown-toc-from-hive-tables">From Hive tables</a></li>
        </ul>
      </li>
      <li><a href="#dataframe-operations" id="markdown-toc-dataframe-operations">DataFrame Operations</a>        <ul>
          <li><a href="#selecting-rows-columns" id="markdown-toc-selecting-rows-columns">Selecting rows, columns</a></li>
          <li><a href="#grouping-aggregation" id="markdown-toc-grouping-aggregation">Grouping, Aggregation</a></li>
          <li><a href="#operating-on-columns" id="markdown-toc-operating-on-columns">Operating on Columns</a></li>
        </ul>
      </li>
      <li><a href="#running-sql-queries-from-sparkr" id="markdown-toc-running-sql-queries-from-sparkr">Running SQL Queries from SparkR</a></li>
    </ul>
  </li>
</ul>

<h1 id="overview">Overview</h1>
<p>SparkR is an R package that provides a lightweight frontend to use Apache Spark from R.
In Spark 1.4.1, SparkR provides a distributed data frame implementation that
supports operations like selection, filtering, and aggregation (similar to R data frames and
<a href="https://github.com/hadley/dplyr">dplyr</a>) but on large datasets.</p>

<h1 id="sparkr-dataframes">SparkR DataFrames</h1>

<p>A DataFrame is a distributed collection of data organized into named columns. It is conceptually
equivalent to a table in a relational database or a data frame in R, but with richer
optimizations under the hood. DataFrames can be constructed from a wide array of sources such as:
structured data files, tables in Hive, external databases, or existing local R data frames.</p>

<p>All of the examples on this page use sample data included in R or the Spark distribution and can be run using the <code>./bin/sparkR</code> shell.</p>

<h2 id="starting-up-sparkcontext-sqlcontext">Starting Up: SparkContext, SQLContext</h2>

<div data-lang="r">
  <p>The entry point into SparkR is the <code>SparkContext</code>, which connects your R program to a Spark cluster.
You can create a <code>SparkContext</code> using <code>sparkR.init</code> and pass in options such as the application name,
any Spark packages depended on, etc. Further, to work with DataFrames we will need a <code>SQLContext</code>,
which can be created from the SparkContext. If you are working from the SparkR shell, the
<code>SQLContext</code> and <code>SparkContext</code> should already be created for you.</p>

  <div class="highlight"><pre><code class="language-r" data-lang="r">sc <span class="o">&lt;-</span> sparkR.init<span class="p">()</span>
sqlContext <span class="o">&lt;-</span> sparkRSQL.init<span class="p">(</span>sc<span class="p">)</span></code></pre></div>

</div>
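<p>For example, here is a minimal sketch of passing such options to <code>sparkR.init</code>; the master URL and application name used below are illustrative, not required values:</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"># Pass an application name and a master URL to sparkR.init.
# Both values shown here are illustrative.
sc &lt;- sparkR.init(master = &quot;local[2]&quot;, appName = &quot;SparkR-example&quot;)
sqlContext &lt;- sparkRSQL.init(sc)</code></pre></div>

</div>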

<h2 id="creating-dataframes">Creating DataFrames</h2>
<p>With a <code>SQLContext</code>, applications can create <code>DataFrame</code>s from a local R data frame, from a <a href="sql-programming-guide.html#hive-tables">Hive table</a>, or from other <a href="sql-programming-guide.html#data-sources">data sources</a>.</p>

<h3 id="from-local-data-frames">From local data frames</h3>
<p>The simplest way to create a data frame is to convert a local R data frame into a SparkR DataFrame. Specifically, we can use <code>createDataFrame</code> and pass in the local R data frame to create a SparkR DataFrame. As an example, the following creates a <code>DataFrame</code> based on the <code>faithful</code> dataset from R.</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r">df <span class="o">&lt;-</span> createDataFrame<span class="p">(</span>sqlContext<span class="p">,</span> faithful<span class="p">)</span> 

<span class="c1"># Displays the content of the DataFrame to stdout</span>
<span class="kp">head</span><span class="p">(</span>df<span class="p">)</span>
<span class="c1">##  eruptions waiting</span>
<span class="c1">##1     3.600      79</span>
<span class="c1">##2     1.800      54</span>
<span class="c1">##3     3.333      74</span></code></pre></div>

</div>
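<p>Conversely, a SparkR DataFrame can be converted back into a local R data frame with <code>collect</code>. A short sketch; note that this brings all of the data to the driver, so it should only be used on results that fit in memory:</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"># Collect the distributed DataFrame back into a local R data.frame.
# This pulls all rows to the driver, so use it only on small data.
localDF &lt;- collect(df)
class(localDF)
## [1] &quot;data.frame&quot;</code></pre></div>

</div>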

<h3 id="from-data-sources">From Data Sources</h3>

<p>SparkR supports operating on a variety of data sources through the <code>DataFrame</code> interface. This section describes the general methods for loading and saving data using Data Sources. You can check the Spark SQL programming guide for more <a href="sql-programming-guide.html#manually-specifying-options">specific options</a> that are available for the built-in data sources.</p>

<p>The general method for creating DataFrames from data sources is <code>read.df</code>. This method takes in the <code>SQLContext</code>, the path of the file to load, and the type of data source. SparkR supports reading JSON and Parquet files natively, and through <a href="http://spark-packages.org/">Spark Packages</a> you can find data source connectors for popular file formats like <a href="http://spark-packages.org/package/databricks/spark-csv">CSV</a> and <a href="http://spark-packages.org/package/databricks/spark-avro">Avro</a>. These packages can be added by
specifying <code>--packages</code> with the <code>spark-submit</code> or <code>sparkR</code> commands, or, if you are creating a context through <code>init</code>,
by specifying the packages with the <code>sparkPackages</code> argument.</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r">sc <span class="o">&lt;-</span> sparkR.init<span class="p">(</span>sparkPackages<span class="o">=</span><span class="s">&quot;com.databricks:spark-csv_2.11:1.0.3&quot;</span><span class="p">)</span>
sqlContext <span class="o">&lt;-</span> sparkRSQL.init<span class="p">(</span>sc<span class="p">)</span></code></pre></div>

</div>
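<p>With the spark-csv package loaded as above, CSV files can then be read through the same <code>read.df</code> interface. A sketch, assuming a hypothetical <code>cars.csv</code> file; the source name follows the spark-csv package convention:</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"># Read a CSV file using the spark-csv data source.
# The path &quot;cars.csv&quot; is a hypothetical example file.
cars &lt;- read.df(sqlContext, &quot;cars.csv&quot;, source = &quot;com.databricks.spark.csv&quot;, header = &quot;true&quot;)</code></pre></div>

</div>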

<p>We can see how to use data sources with an example JSON input file. Note that the file that is used here is <em>not</em> a typical JSON file. Each line in the file must contain a separate, self-contained valid JSON object. As a consequence, a regular multi-line JSON file will most often fail.</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r">people <span class="o">&lt;-</span> read.df<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;./examples/src/main/resources/people.json&quot;</span><span class="p">,</span> <span class="s">&quot;json&quot;</span><span class="p">)</span>
<span class="kp">head</span><span class="p">(</span>people<span class="p">)</span>
<span class="c1">##  age    name</span>
<span class="c1">##1  NA Michael</span>
<span class="c1">##2  30    Andy</span>
<span class="c1">##3  19  Justin</span>

<span class="c1"># SparkR automatically infers the schema from the JSON file</span>
printSchema<span class="p">(</span>people<span class="p">)</span>
<span class="c1"># root</span>
<span class="c1">#  |-- age: integer (nullable = true)</span>
<span class="c1">#  |-- name: string (nullable = true)</span></code></pre></div>

</div>

<p>The data sources API can also be used to save out DataFrames into multiple file formats. For example, we can save the DataFrame from the previous example
to a Parquet file using <code>write.df</code>.</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r">write.df<span class="p">(</span>people<span class="p">,</span> path<span class="o">=</span><span class="s">&quot;people.parquet&quot;</span><span class="p">,</span> <span class="kn">source</span><span class="o">=</span><span class="s">&quot;parquet&quot;</span><span class="p">,</span> mode<span class="o">=</span><span class="s">&quot;overwrite&quot;</span><span class="p">)</span></code></pre></div>

</div>
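<p>The saved DataFrame can be loaded back in with <code>read.df</code>, specifying <code>&quot;parquet&quot;</code> as the source type (a short sketch using the file written above):</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"># Read back the Parquet file written by write.df above.
peopleParquet &lt;- read.df(sqlContext, &quot;people.parquet&quot;, &quot;parquet&quot;)
printSchema(peopleParquet)</code></pre></div>

</div>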

<h3 id="from-hive-tables">From Hive tables</h3>

<p>You can also create SparkR DataFrames from Hive tables. To do this, we will need to create a <code>HiveContext</code>, which can access tables in the Hive MetaStore. Note that Spark should have been built with <a href="building-spark.html#building-with-hive-and-jdbc-support">Hive support</a>; more details on the difference between <code>SQLContext</code> and <code>HiveContext</code> can be found in the <a href="sql-programming-guide.html#starting-point-sqlcontext">SQL programming guide</a>.</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># sc is an existing SparkContext.</span>
hiveContext <span class="o">&lt;-</span> sparkRHive.init<span class="p">(</span>sc<span class="p">)</span>

sql<span class="p">(</span>hiveContext<span class="p">,</span> <span class="s">&quot;CREATE TABLE IF NOT EXISTS src (key INT, value STRING)&quot;</span><span class="p">)</span>
sql<span class="p">(</span>hiveContext<span class="p">,</span> <span class="s">&quot;LOAD DATA LOCAL INPATH &#39;examples/src/main/resources/kv1.txt&#39; INTO TABLE src&quot;</span><span class="p">)</span>

<span class="c1"># Queries can be expressed in HiveQL.</span>
results <span class="o">&lt;-</span> sql<span class="p">(</span>hiveContext<span class="p">,</span> <span class="s">&quot;FROM src SELECT key, value&quot;</span><span class="p">)</span>

<span class="c1"># results is now a DataFrame</span>
<span class="kp">head</span><span class="p">(</span>results<span class="p">)</span>
<span class="c1">##  key   value</span>
<span class="c1">## 1 238 val_238</span>
<span class="c1">## 2  86  val_86</span>
<span class="c1">## 3 311 val_311</span></code></pre></div>

</div>

<h2 id="dataframe-operations">DataFrame Operations</h2>

<p>SparkR DataFrames support a number of functions to do structured data processing.
Here we include some basic examples; a complete list can be found in the <a href="api/R/index.html">API</a> docs:</p>

<h3 id="selecting-rows-columns">Selecting rows, columns</h3>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Create the DataFrame</span>
df <span class="o">&lt;-</span> createDataFrame<span class="p">(</span>sqlContext<span class="p">,</span> faithful<span class="p">)</span> 

<span class="c1"># Get basic information about the DataFrame</span>
df
<span class="c1">## DataFrame[eruptions:double, waiting:double]</span>

<span class="c1"># Select only the &quot;eruptions&quot; column</span>
<span class="kp">head</span><span class="p">(</span>select<span class="p">(</span>df<span class="p">,</span> df<span class="o">$</span>eruptions<span class="p">))</span>
<span class="c1">##  eruptions</span>
<span class="c1">##1     3.600</span>
<span class="c1">##2     1.800</span>
<span class="c1">##3     3.333</span>

<span class="c1"># You can also pass in column names as strings</span>
<span class="kp">head</span><span class="p">(</span>select<span class="p">(</span>df<span class="p">,</span> <span class="s">&quot;eruptions&quot;</span><span class="p">))</span>

<span class="c1"># Filter the DataFrame to only retain rows with wait times shorter than 50 mins</span>
<span class="kp">head</span><span class="p">(</span>filter<span class="p">(</span>df<span class="p">,</span> df<span class="o">$</span>waiting <span class="o">&lt;</span> <span class="m">50</span><span class="p">))</span>
<span class="c1">##  eruptions waiting</span>
<span class="c1">##1     1.750      47</span>
<span class="c1">##2     1.750      47</span>
<span class="c1">##3     1.867      48</span></code></pre></div>

</div>

<h3 id="grouping-aggregation">Grouping, Aggregation</h3>

<p>SparkR DataFrames support a number of commonly used functions to aggregate data after grouping. For example, we can compute a histogram of the <code>waiting</code> time in the <code>faithful</code> dataset as shown below.</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># We use the `n` function to count the number of times each waiting time appears</span>
<span class="kp">head</span><span class="p">(</span>summarize<span class="p">(</span>groupBy<span class="p">(</span>df<span class="p">,</span> df<span class="o">$</span>waiting<span class="p">),</span> count <span class="o">=</span> n<span class="p">(</span>df<span class="o">$</span>waiting<span class="p">)))</span>
<span class="c1">##  waiting count</span>
<span class="c1">##1      81    13</span>
<span class="c1">##2      60     6</span>
<span class="c1">##3      68     1</span>

<span class="c1"># We can also sort the output from the aggregation to get the most common waiting times</span>
waiting_counts <span class="o">&lt;-</span> summarize<span class="p">(</span>groupBy<span class="p">(</span>df<span class="p">,</span> df<span class="o">$</span>waiting<span class="p">),</span> count <span class="o">=</span> n<span class="p">(</span>df<span class="o">$</span>waiting<span class="p">))</span>
<span class="kp">head</span><span class="p">(</span>arrange<span class="p">(</span>waiting_counts<span class="p">,</span> desc<span class="p">(</span>waiting_counts<span class="o">$</span>count<span class="p">)))</span>

<span class="c1">##   waiting count</span>
<span class="c1">##1      78    15</span>
<span class="c1">##2      83    14</span>
<span class="c1">##3      81    13</span></code></pre></div>

</div>
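<p>Other aggregate functions, such as <code>avg</code>, <code>min</code>, and <code>max</code>, can be used inside <code>summarize</code> in the same way. A short sketch computing the average eruption length for each waiting time (output omitted):</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"># Compute the average eruption length for each waiting time.
head(summarize(groupBy(df, df$waiting), avg_eruptions = avg(df$eruptions)))</code></pre></div>

</div>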

<h3 id="operating-on-columns">Operating on Columns</h3>

<p>SparkR also provides a number of functions that can be directly applied to columns for data processing and during aggregation. The example below shows the use of basic arithmetic functions.</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Convert waiting time from minutes to seconds.</span>
<span class="c1"># Note that we can assign this to a new column in the same DataFrame</span>
df<span class="o">$</span>waiting_secs <span class="o">&lt;-</span> df<span class="o">$</span>waiting <span class="o">*</span> <span class="m">60</span>
<span class="kp">head</span><span class="p">(</span>df<span class="p">)</span>
<span class="c1">##  eruptions waiting waiting_secs</span>
<span class="c1">##1     3.600      79         4740</span>
<span class="c1">##2     1.800      54         3240</span>
<span class="c1">##3     3.333      74         4440</span></code></pre></div>

</div>

<h2 id="running-sql-queries-from-sparkr">Running SQL Queries from SparkR</h2>
<p>A SparkR DataFrame can also be registered as a temporary table in Spark SQL; registering a DataFrame as a table allows you to run SQL queries over its data.
The <code>sql</code> function enables applications to run SQL queries programmatically and returns the result as a <code>DataFrame</code>.</p>

<div data-lang="r">

  <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Load a JSON file</span>
people <span class="o">&lt;-</span> read.df<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;./examples/src/main/resources/people.json&quot;</span><span class="p">,</span> <span class="s">&quot;json&quot;</span><span class="p">)</span>

<span class="c1"># Register this DataFrame as a table.</span>
registerTempTable<span class="p">(</span>people<span class="p">,</span> <span class="s">&quot;people&quot;</span><span class="p">)</span>

<span class="c1"># SQL statements can be run by using the sql method</span>
teenagers <span class="o">&lt;-</span> sql<span class="p">(</span>sqlContext<span class="p">,</span> <span class="s">&quot;SELECT name FROM people WHERE age &gt;= 13 AND age &lt;= 19&quot;</span><span class="p">)</span>
<span class="kp">head</span><span class="p">(</span>teenagers<span class="p">)</span>
<span class="c1">##    name</span>
<span class="c1">##1 Justin</span></code></pre></div>

</div>


        </div> <!-- /container -->

        <script src="js/vendor/jquery-1.8.0.min.js"></script>
        <script src="js/vendor/bootstrap.min.js"></script>
        <script src="js/vendor/anchor.min.js"></script>
        <script src="js/main.js"></script>

        <!-- MathJax Section -->
        <script type="text/x-mathjax-config">
            MathJax.Hub.Config({
                TeX: { equationNumbers: { autoNumber: "AMS" } }
            });
        </script>
        <script>
            // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
            // We could use "//cdn.mathjax...", but that won't support "file://".
            (function(d, script) {
                script = d.createElement('script');
                script.type = 'text/javascript';
                script.async = true;
                script.onload = function(){
                    MathJax.Hub.Config({
                        tex2jax: {
                            inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
                            displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
                            processEscapes: true,
                            skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
                        }
                    });
                };
                script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
                    'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
                d.getElementsByTagName('head')[0].appendChild(script);
            }(document));
        </script>
    </body>
</html>