summaryrefslogblamecommitdiff
path: root/site/docs/1.0.1/mllib-naive-bayes.html
blob: 7cc589e40c796dc629967b0b4cbb103ff283eb0e (plain) (tree)






























































































































































































































































































                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
        <title>Naive Bayes - MLlib - Spark 1.0.1 Documentation</title>
        <meta name="description" content="">

        

        <link rel="stylesheet" href="css/bootstrap.min.css">
        <style>
            body {
                padding-top: 60px;
                padding-bottom: 40px;
            }
        </style>
        <meta name="viewport" content="width=device-width">
        <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
        <link rel="stylesheet" href="css/main.css">

        <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>

        <link rel="stylesheet" href="css/pygments-default.css">

        
        <!-- Google analytics script -->
        <script type="text/javascript">
          var _gaq = _gaq || [];
          _gaq.push(['_setAccount', 'UA-32518208-1']);
          _gaq.push(['_trackPageview']);

          (function() {
            var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
            ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
            var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
          })();
        </script>
        

    </head>
    <body>
        <!--[if lt IE 7]>
            <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
        <![endif]-->

        <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->

        <div class="navbar navbar-fixed-top" id="topbar">
            <div class="navbar-inner">
                <div class="container">
                    <div class="brand"><a href="index.html">
                      <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">1.0.1</span>
                    </div>
                    <ul class="nav">
                        <!--TODO(andyk): Add class="active" attribute to li some how.-->
                        <li><a href="index.html">Overview</a></li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="quick-start.html">Quick Start</a></li>
                                <li><a href="programming-guide.html">Spark Programming Guide</a></li>
                                <li class="divider"></li>
                                <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
                                <li><a href="sql-programming-guide.html">Spark SQL</a></li>
                                <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
                                <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
                                <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="api/scala/index.html#org.apache.spark.package">Scaladoc</a></li>
                                <li><a href="api/java/index.html">Javadoc</a></li>
                                <li><a href="api/python/index.html">Python API</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="cluster-overview.html">Overview</a></li>
                                <li><a href="submitting-applications.html">Submitting Applications</a></li>
                                <li class="divider"></li>
                                <li><a href="ec2-scripts.html">Amazon EC2</a></li>
                                <li><a href="spark-standalone.html">Standalone Mode</a></li>
                                <li><a href="running-on-mesos.html">Mesos</a></li>
                                <li><a href="running-on-yarn.html">YARN</a></li>
                            </ul>
                        </li>

                        <li class="dropdown">
                            <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
                            <ul class="dropdown-menu">
                                <li><a href="configuration.html">Configuration</a></li>
                                <li><a href="monitoring.html">Monitoring</a></li>
                                <li><a href="tuning.html">Tuning Guide</a></li>
                                <li><a href="job-scheduling.html">Job Scheduling</a></li>
                                <li><a href="security.html">Security</a></li>
                                <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
                                <li><a href="hadoop-third-party-distributions.html">3<sup>rd</sup>-Party Hadoop Distros</a></li>
                                <li class="divider"></li>
                                <li><a href="building-with-maven.html">Building Spark with Maven</a></li>
                                <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
                            </ul>
                        </li>
                    </ul>
                    <!--<p class="navbar-text pull-right"><span class="version-text">v1.0.1</span></p>-->
                </div>
            </div>
        </div>

        <div class="container" id="content">
          
            <h1 class="title"><a href="mllib-guide.html">MLlib</a> - Naive Bayes</h1>
          

          <p>Naive Bayes is a simple multiclass classification algorithm with the assumption of independence
between every pair of features. Naive Bayes can be trained very efficiently. Within a single pass to
the training data, it computes the conditional probability distribution of each feature given label,
and then it applies Bayes&#8217; theorem to compute the conditional probability distribution of label
given an observation and use it for prediction. For more details, please visit the Wikipedia page
<a href="http://en.wikipedia.org/wiki/Naive_Bayes_classifier">Naive Bayes classifier</a>.</p>

<p>In MLlib, we implemented multinomial naive Bayes, which is typically used for document
classification. Within that context, each observation is a document, each feature represents a term,
whose value is the frequency of the term. For its formulation, please visit the Wikipedia page
<a href="http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes">Multinomial Naive Bayes</a>
or the section
<a href="http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html">Naive Bayes text classification</a>
from the book Introduction to Information
Retrieval. <a href="http://en.wikipedia.org/wiki/Lidstone_smoothing">Additive smoothing</a> can be used by
setting the parameter $\lambda$ (default to $1.0$). For document classification, the input feature
vectors are usually sparse. Please supply sparse vectors as input to take advantage of
sparsity. Since the training data is only used once, it is not necessary to cache it.</p>

<h2 id="examples">Examples</h2>

<div class="codetabs">
<div data-lang="scala">

    <p><a href="api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$">NaiveBayes</a> implements
multinomial naive Bayes. It takes an RDD of
<a href="api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint">LabeledPoint</a> and an optional
smoothing parameter <code>lambda</code> as input, and output a
<a href="api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel">NaiveBayesModel</a>, which
can be used for evaluation and prediction.</p>

    <div class="highlight"><pre><code class="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.classification.NaiveBayes</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span>
<span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span>

<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">&quot;mllib/data/sample_naive_bayes_data.txt&quot;</span><span class="o">)</span>
<span class="k">val</span> <span class="n">parsedData</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="n">line</span> <span class="k">=&gt;</span>
  <span class="k">val</span> <span class="n">parts</span> <span class="k">=</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="sc">&#39;,&#39;</span><span class="o">)</span>
  <span class="nc">LabeledPoint</span><span class="o">(</span><span class="n">parts</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="n">toDouble</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="n">parts</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="n">split</span><span class="o">(</span><span class="sc">&#39; &#39;</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">toDouble</span><span class="o">)))</span>
<span class="o">}</span>
<span class="c1">// Split data into training (60%) and test (40%).</span>
<span class="k">val</span> <span class="n">splits</span> <span class="k">=</span> <span class="n">parsedData</span><span class="o">.</span><span class="n">randomSplit</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">),</span> <span class="n">seed</span> <span class="k">=</span> <span class="mi">11L</span><span class="o">)</span>
<span class="k">val</span> <span class="n">training</span> <span class="k">=</span> <span class="n">splits</span><span class="o">(</span><span class="mi">0</span><span class="o">)</span>
<span class="k">val</span> <span class="n">test</span> <span class="k">=</span> <span class="n">splits</span><span class="o">(</span><span class="mi">1</span><span class="o">)</span>

<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="nc">NaiveBayes</span><span class="o">.</span><span class="n">train</span><span class="o">(</span><span class="n">training</span><span class="o">,</span> <span class="n">lambda</span> <span class="k">=</span> <span class="mf">1.0</span><span class="o">)</span>
<span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">test</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">features</span><span class="o">))</span>

<span class="k">val</span> <span class="n">predictionAndLabel</span> <span class="k">=</span> <span class="n">prediction</span><span class="o">.</span><span class="n">zip</span><span class="o">(</span><span class="n">test</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">label</span><span class="o">))</span>
<span class="k">val</span> <span class="n">accuracy</span> <span class="k">=</span> <span class="mf">1.0</span> <span class="o">*</span> <span class="n">predictionAndLabel</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span><span class="n">x</span> <span class="k">=&gt;</span> <span class="n">x</span><span class="o">.</span><span class="n">_1</span> <span class="o">==</span> <span class="n">x</span><span class="o">.</span><span class="n">_2</span><span class="o">).</span><span class="n">count</span><span class="o">()</span> <span class="o">/</span> <span class="n">test</span><span class="o">.</span><span class="n">count</span><span class="o">()</span>
</code></pre></div>

  </div>

<div data-lang="java">

    <p><a href="api/java/org/apache/spark/mllib/classification/NaiveBayes.html">NaiveBayes</a> implements
multinomial naive Bayes. It takes a Scala RDD of
<a href="api/java/org/apache/spark/mllib/regression/LabeledPoint.html">LabeledPoint</a> and an
optionally smoothing parameter <code>lambda</code> as input, and output a
<a href="api/java/org/apache/spark/mllib/classification/NaiveBayesModel.html">NaiveBayesModel</a>, which
can be used for evaluation and prediction.</p>

    <div class="highlight"><pre><code class="java"><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaPairRDD</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.NaiveBayes</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.NaiveBayesModel</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span>

<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">training</span> <span class="o">=</span> <span class="o">...</span> <span class="c1">// training set</span>
<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">test</span> <span class="o">=</span> <span class="o">...</span> <span class="c1">// test set</span>

<span class="kd">final</span> <span class="n">NaiveBayesModel</span> <span class="n">model</span> <span class="o">=</span> <span class="n">NaiveBayes</span><span class="o">.</span><span class="na">train</span><span class="o">(</span><span class="n">training</span><span class="o">.</span><span class="na">rdd</span><span class="o">(),</span> <span class="mf">1.0</span><span class="o">);</span>

<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Double</span><span class="o">&gt;</span> <span class="n">prediction</span> <span class="o">=</span>
  <span class="n">test</span><span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">,</span> <span class="n">Double</span><span class="o">&gt;()</span> <span class="o">{</span>
    <span class="nd">@Override</span> <span class="kd">public</span> <span class="n">Double</span> <span class="n">call</span><span class="o">(</span><span class="n">LabeledPoint</span> <span class="n">p</span><span class="o">)</span> <span class="o">{</span>
      <span class="k">return</span> <span class="n">model</span><span class="o">.</span><span class="na">predict</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="na">features</span><span class="o">());</span>
    <span class="o">}</span>
  <span class="o">});</span>
<span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">Double</span><span class="o">,</span> <span class="n">Double</span><span class="o">&gt;</span> <span class="n">predictionAndLabel</span> <span class="o">=</span> 
  <span class="n">prediction</span><span class="o">.</span><span class="na">zip</span><span class="o">(</span><span class="n">test</span><span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">,</span> <span class="n">Double</span><span class="o">&gt;()</span> <span class="o">{</span>
    <span class="nd">@Override</span> <span class="kd">public</span> <span class="n">Double</span> <span class="n">call</span><span class="o">(</span><span class="n">LabeledPoint</span> <span class="n">p</span><span class="o">)</span> <span class="o">{</span>
      <span class="k">return</span> <span class="n">p</span><span class="o">.</span><span class="na">label</span><span class="o">();</span>
    <span class="o">}</span>
  <span class="o">}));</span>
<span class="kt">double</span> <span class="n">accuracy</span> <span class="o">=</span> <span class="mf">1.0</span> <span class="o">*</span> <span class="n">predictionAndLabel</span><span class="o">.</span><span class="na">filter</span><span class="o">(</span><span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Double</span><span class="o">,</span> <span class="n">Double</span><span class="o">&gt;,</span> <span class="n">Boolean</span><span class="o">&gt;()</span> <span class="o">{</span>
    <span class="nd">@Override</span> <span class="kd">public</span> <span class="n">Boolean</span> <span class="n">call</span><span class="o">(</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Double</span><span class="o">,</span> <span class="n">Double</span><span class="o">&gt;</span> <span class="n">pl</span><span class="o">)</span> <span class="o">{</span>
      <span class="k">return</span> <span class="n">pl</span><span class="o">.</span><span class="na">_1</span><span class="o">()</span> <span class="o">==</span> <span class="n">pl</span><span class="o">.</span><span class="na">_2</span><span class="o">();</span>
    <span class="o">}</span>
  <span class="o">}).</span><span class="na">count</span><span class="o">()</span> <span class="o">/</span> <span class="n">test</span><span class="o">.</span><span class="na">count</span><span class="o">();</span>
</code></pre></div>

  </div>

<div data-lang="python">

    <p><a href="api/python/pyspark.mllib.classification.NaiveBayes-class.html">NaiveBayes</a> implements multinomial
naive Bayes. It takes an RDD of
<a href="api/python/pyspark.mllib.regression.LabeledPoint-class.html">LabeledPoint</a> and an optionally
smoothing parameter <code>lambda</code> as input, and output a
<a href="api/python/pyspark.mllib.classification.NaiveBayesModel-class.html">NaiveBayesModel</a>, which can be
used for evaluation and prediction.</p>

    <!-- TODO: Make Python's example consistent with Scala's and Java's. -->

    <div class="highlight"><pre><code class="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="kn">import</span> <span class="n">LabeledPoint</span>
<span class="kn">from</span> <span class="nn">pyspark.mllib.classification</span> <span class="kn">import</span> <span class="n">NaiveBayes</span>

<span class="c"># an RDD of LabeledPoint</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
  <span class="n">LabeledPoint</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])</span>
  <span class="o">...</span> <span class="c"># more labeled points</span>
<span class="p">])</span>

<span class="c"># Train a naive Bayes model.</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">NaiveBayes</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)</span>

<span class="c"># Make prediction.</span>
<span class="n">prediction</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])</span>
</code></pre></div>

  </div>
</div>


        </div> <!-- /container -->

        <script src="js/vendor/jquery-1.8.0.min.js"></script>
        <script src="js/vendor/bootstrap.min.js"></script>
        <script src="js/main.js"></script>

        <!-- MathJax Section -->
        <script type="text/x-mathjax-config">
            MathJax.Hub.Config({
                TeX: { equationNumbers: { autoNumber: "AMS" } }
            });
        </script>
        <script>
            // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
            // We could use "//cdn.mathjax...", but that won't support "file://".
            (function(d, script) {
                script = d.createElement('script');
                script.type = 'text/javascript';
                script.async = true;
                script.onload = function(){
                    MathJax.Hub.Config({
                        tex2jax: {
                            inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
                            displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], 
                            processEscapes: true,
                            skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
                        }
                    });
                };
                script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
                    'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
                d.getElementsByTagName('head')[0].appendChild(script);
            }(document));
        </script>
    </body>
</html>