summaryrefslogblamecommitdiff
path: root/site/docs/1.0.1/mllib-dimensionality-reduction.html
blob: 153b8fb7adec8d29c6d93b58421ad22a3f51b5ce (plain) (tree)































































































































































































































































                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
        <title>Dimensionality Reduction - MLlib - Spark 1.0.1 Documentation</title>
        <meta name="description" content="">

        

        <link rel="stylesheet" href="css/bootstrap.min.css">
        <style>
            body {
                padding-top: 60px;
                padding-bottom: 40px;
            }
        </style>
        <meta name="viewport" content="width=device-width">
        <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
        <link rel="stylesheet" href="css/main.css">

        <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>

        <link rel="stylesheet" href="css/pygments-default.css">

        
        <!-- Google analytics script -->
        <script type="text/javascript">
          var _gaq = _gaq || [];
          _gaq.push(['_setAccount', 'UA-32518208-1']);
          _gaq.push(['_trackPageview']);

          (function() {
            var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
            ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
            var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
          })();
        </script>
        

    </head>
    <body>
        <!--[if lt IE 7]>
            <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
        <![endif]-->

        <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->

        <div class="navbar navbar-fixed-top" id="topbar">
            <div class="navbar-inner">
                <div class="container">
                    <div class="brand"><a href="index.html">
                      <img src="img/spark-logo-hd.png" alt="Spark" style="height:50px;"/></a><span class="version">1.0.1</span>
                    </div>
                    <ul class="nav">
                        <!--TODO(andyk): Add class="active" attribute to li some how.-->
                        <li><a href="index.html">Overview</a></li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="quick-start.html">Quick Start</a></li>
                                <li><a href="programming-guide.html">Spark Programming Guide</a></li>
                                <li class="divider"></li>
                                <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
                                <li><a href="sql-programming-guide.html">Spark SQL</a></li>
                                <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
                                <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
                                <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="api/scala/index.html#org.apache.spark.package">Scaladoc</a></li>
                                <li><a href="api/java/index.html">Javadoc</a></li>
                                <li><a href="api/python/index.html">Python API</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="cluster-overview.html">Overview</a></li>
                                <li><a href="submitting-applications.html">Submitting Applications</a></li>
                                <li class="divider"></li>
                                <li><a href="ec2-scripts.html">Amazon EC2</a></li>
                                <li><a href="spark-standalone.html">Standalone Mode</a></li>
                                <li><a href="running-on-mesos.html">Mesos</a></li>
                                <li><a href="running-on-yarn.html">YARN</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="configuration.html">Configuration</a></li>
                                <li><a href="monitoring.html">Monitoring</a></li>
                                <li><a href="tuning.html">Tuning Guide</a></li>
                                <li><a href="job-scheduling.html">Job Scheduling</a></li>
                                <li><a href="security.html">Security</a></li>
                                <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
                                <li><a href="hadoop-third-party-distributions.html">3<sup>rd</sup>-Party Hadoop Distros</a></li>
                                <li class="divider"></li>
                                <li><a href="building-with-maven.html">Building Spark with Maven</a></li>
                                <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
                            </ul>
                        </li>
                    </ul>
                    <!--<p class="navbar-text pull-right"><span class="version-text">v1.0.1</span></p>-->
                </div>
            </div>
        </div>

        <div class="container" id="content">
          
            <h1 class="title"><a href="mllib-guide.html">MLlib</a> - Dimensionality Reduction</h1>
          

          <ul id="markdown-toc">
  <li><a href="#singular-value-decomposition-svd">Singular value decomposition (SVD)</a></li>
  <li><a href="#principal-component-analysis-pca">Principal component analysis (PCA)</a></li>
</ul>

<p><a href="http://en.wikipedia.org/wiki/Dimensionality_reduction">Dimensionality reduction</a> is the process 
of reducing the number of variables under consideration.
It is used to extract latent features from raw and noisy features,
or compress data while maintaining the structure.
In this release, we provide preliminary support for dimensionality reduction on tall-and-skinny matrices.</p>

<h2 id="singular-value-decomposition-svd">Singular value decomposition (SVD)</h2>

<p><a href="http://en.wikipedia.org/wiki/Singular_value_decomposition">Singular value decomposition (SVD)</a>
factorizes a matrix into three matrices: $U$, $\Sigma$, and $V$ such that</p>

<p><code>\[
A = U \Sigma V^T,
\]</code></p>

<p>where </p>

<ul>
  <li>$U$ is an orthonormal matrix, whose columns are called left singular vectors,</li>
  <li>$\Sigma$ is a diagonal matrix with non-negative diagonals in descending order, 
whose diagonals are called singular values,</li>
  <li>$V$ is an orthonormal matrix, whose columns are called right singular vectors.</li>
</ul>

<p>For large matrices, usually we don&#8217;t need the complete factorization but only the top singular
values and their associated singular vectors.  This can save storage and, more importantly, de-noise
and recover the low-rank structure of the matrix.</p>

<p>If we keep the top $k$ singular values, then the dimensions of the resulting low-rank matrices will be:</p>

<ul>
  <li><code>$U$</code>: <code>$m \times k$</code>,</li>
  <li><code>$\Sigma$</code>: <code>$k \times k$</code>,</li>
  <li><code>$V$</code>: <code>$n \times k$</code>.</li>
</ul>

<p>In this release, we provide SVD computation for row-oriented matrices that have only a few columns,
say, fewer than $1000$, but many rows, which we call <em>tall-and-skinny</em>.</p>

<div class="codetabs">
<div data-lang="scala">

    <div class="highlight"><pre><code class="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Matrix</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.distributed.RowMatrix</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.SingularValueDecomposition</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span>

<span class="k">val</span> <span class="n">mat</span><span class="k">:</span> <span class="kt">RowMatrix</span> <span class="o">=</span> <span class="o">...</span>

<span class="c1">// Compute the top 20 singular values and corresponding singular vectors.</span>
<span class="k">val</span> <span class="n">svd</span><span class="k">:</span> <span class="kt">SingularValueDecomposition</span><span class="o">[</span><span class="kt">RowMatrix</span>, <span class="kt">Matrix</span><span class="o">]</span> <span class="k">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">computeSVD</span><span class="o">(</span><span class="mi">20</span><span class="o">,</span> <span class="n">computeU</span> <span class="k">=</span> <span class="kc">true</span><span class="o">)</span>
<span class="k">val</span> <span class="n">U</span><span class="k">:</span> <span class="kt">RowMatrix</span> <span class="o">=</span> <span class="n">svd</span><span class="o">.</span><span class="n">U</span> <span class="c1">// The U factor is a RowMatrix.</span>
<span class="k">val</span> <span class="n">s</span><span class="k">:</span> <span class="kt">Vector</span> <span class="o">=</span> <span class="n">svd</span><span class="o">.</span><span class="n">s</span> <span class="c1">// The singular values are stored in a local dense vector.</span>
<span class="k">val</span> <span class="n">V</span><span class="k">:</span> <span class="kt">Matrix</span> <span class="o">=</span> <span class="n">svd</span><span class="o">.</span><span class="n">V</span> <span class="c1">// The V factor is a local dense matrix.</span>
</code></pre></div>

    <p>The same code applies to <code>IndexedRowMatrix</code>.
The only difference is that the <code>U</code> matrix becomes an <code>IndexedRowMatrix</code>.</p>

  </div>
</div>

<h2 id="principal-component-analysis-pca">Principal component analysis (PCA)</h2>

<p><a href="http://en.wikipedia.org/wiki/Principal_component_analysis">Principal component analysis (PCA)</a> is a
statistical method to find a rotation such that the first coordinate has the largest variance
possible, and each succeeding coordinate in turn has the largest variance possible. The columns of
the rotation matrix are called principal components. PCA is used widely in dimensionality reduction.</p>

<p>In this release, we implement PCA for tall-and-skinny matrices stored in row-oriented format.</p>

<div class="codetabs">
<div data-lang="scala">

    <p>The following code demonstrates how to compute principal components on a tall-and-skinny <code>RowMatrix</code>
and use them to project the vectors into a low-dimensional space.
The number of columns should be small, e.g., less than 1000.</p>

    <div class="highlight"><pre><code class="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Matrix</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.distributed.RowMatrix</span>

<span class="k">val</span> <span class="n">mat</span><span class="k">:</span> <span class="kt">RowMatrix</span> <span class="o">=</span> <span class="o">...</span>

<span class="c1">// Compute the top 10 principal components.</span>
<span class="k">val</span> <span class="n">pc</span><span class="k">:</span> <span class="kt">Matrix</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">computePrincipalComponents</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> <span class="c1">// Principal components are stored in a local dense matrix.</span>

<span class="c1">// Project the rows to the linear space spanned by the top 10 principal components.</span>
<span class="k">val</span> <span class="n">projected</span><span class="k">:</span> <span class="kt">RowMatrix</span> <span class="o">=</span> <span class="n">mat</span><span class="o">.</span><span class="n">multiply</span><span class="o">(</span><span class="n">pc</span><span class="o">)</span>
</code></pre></div>

  </div>
</div>


        </div> <!-- /container -->

        <script src="js/vendor/jquery-1.8.0.min.js"></script>
        <script src="js/vendor/bootstrap.min.js"></script>
        <script src="js/main.js"></script>

        <!-- MathJax Section -->
        <script type="text/x-mathjax-config">
            // MathJax configuration block: sets the TeX input's equation auto-numbering mode to "AMS".
            MathJax.Hub.Config({
                TeX: { equationNumbers: { autoNumber: "AMS" } }
            });
        </script>
        <script>
            // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
            // We could use "//cdn.mathjax...", but that won't support "file://".
            (function(d, script) {
                script = d.createElement('script');
                script.type = 'text/javascript';
                script.async = true;
                script.onload = function(){
                    MathJax.Hub.Config({
                        tex2jax: {
                            inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
                            displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], 
                            processEscapes: true,
                            skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
                        }
                    });
                };
                script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
                    'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
                d.getElementsByTagName('head')[0].appendChild(script);
            }(document));
        </script>
    </body>
</html>