summaryrefslogtreecommitdiff
path: root/site/docs/1.5.0/mllib-evaluation-metrics.html
diff options
context:
space:
mode:
authorReynold Xin <rxin@apache.org>2015-09-17 22:11:21 +0000
committerReynold Xin <rxin@apache.org>2015-09-17 22:11:21 +0000
commit6f57b0c45a7d1b6255067c6e9bc549baa491acac (patch)
treedbf7d7a7700e9e6bad3c8289ab831bc9c2c20d62 /site/docs/1.5.0/mllib-evaluation-metrics.html
parentee9ffe89d608e7640a2487406b618d27e58026d6 (diff)
downloadspark-website-6f57b0c45a7d1b6255067c6e9bc549baa491acac.tar.gz
spark-website-6f57b0c45a7d1b6255067c6e9bc549baa491acac.tar.bz2
spark-website-6f57b0c45a7d1b6255067c6e9bc549baa491acac.zip
add 1.5.0 back
Diffstat (limited to 'site/docs/1.5.0/mllib-evaluation-metrics.html')
-rw-r--r--site/docs/1.5.0/mllib-evaluation-metrics.html1640
1 files changed, 1640 insertions, 0 deletions
diff --git a/site/docs/1.5.0/mllib-evaluation-metrics.html b/site/docs/1.5.0/mllib-evaluation-metrics.html
new file mode 100644
index 000000000..27d418abf
--- /dev/null
+++ b/site/docs/1.5.0/mllib-evaluation-metrics.html
@@ -0,0 +1,1640 @@
+<!DOCTYPE html>
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
+<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
+<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
+ <head>
+ <meta charset="utf-8">
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+ <title>Evaluation Metrics - MLlib - Spark 1.5.0 Documentation</title>
+
+
+
+
+ <link rel="stylesheet" href="css/bootstrap.min.css">
+ <style>
+ body {
+ padding-top: 60px;
+ padding-bottom: 40px;
+ }
+ </style>
+ <meta name="viewport" content="width=device-width">
+ <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
+ <link rel="stylesheet" href="css/main.css">
+
+ <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
+
+ <link rel="stylesheet" href="css/pygments-default.css">
+
+
+ <!-- Google analytics script -->
+ <script type="text/javascript">
+ var _gaq = _gaq || [];
+ _gaq.push(['_setAccount', 'UA-32518208-2']);
+ _gaq.push(['_trackPageview']);
+
+ (function() {
+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+ })();
+ </script>
+
+
+ </head>
+ <body>
+ <!--[if lt IE 7]>
+ <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
+ <![endif]-->
+
+ <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
+
+ <div class="navbar navbar-fixed-top" id="topbar">
+ <div class="navbar-inner">
+ <div class="container">
+ <div class="brand"><a href="index.html">
+ <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">1.5.0</span>
+ </div>
+ <ul class="nav">
+ <!--TODO(andyk): Add class="active" attribute to li some how.-->
+ <li><a href="index.html">Overview</a></li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="quick-start.html">Quick Start</a></li>
+ <li><a href="programming-guide.html">Spark Programming Guide</a></li>
+ <li class="divider"></li>
+ <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
+ <li><a href="sql-programming-guide.html">DataFrames and SQL</a></li>
+ <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
+ <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
+ <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
+ <li><a href="sparkr.html">SparkR (R on Spark)</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li>
+ <li><a href="api/java/index.html">Java</a></li>
+ <li><a href="api/python/index.html">Python</a></li>
+ <li><a href="api/R/index.html">R</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="cluster-overview.html">Overview</a></li>
+ <li><a href="submitting-applications.html">Submitting Applications</a></li>
+ <li class="divider"></li>
+ <li><a href="spark-standalone.html">Spark Standalone</a></li>
+ <li><a href="running-on-mesos.html">Mesos</a></li>
+ <li><a href="running-on-yarn.html">YARN</a></li>
+ <li class="divider"></li>
+ <li><a href="ec2-scripts.html">Amazon EC2</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="configuration.html">Configuration</a></li>
+ <li><a href="monitoring.html">Monitoring</a></li>
+ <li><a href="tuning.html">Tuning Guide</a></li>
+ <li><a href="job-scheduling.html">Job Scheduling</a></li>
+ <li><a href="security.html">Security</a></li>
+ <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
+ <li><a href="hadoop-third-party-distributions.html">3<sup>rd</sup>-Party Hadoop Distros</a></li>
+ <li class="divider"></li>
+ <li><a href="building-spark.html">Building Spark</a></li>
+ <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
+ <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects">Supplemental Projects</a></li>
+ </ul>
+ </li>
+ </ul>
+ <!--<p class="navbar-text pull-right"><span class="version-text">v1.5.0</span></p>-->
+ </div>
+ </div>
+ </div>
+
+ <div class="container" id="content">
+
+ <h1 class="title"><a href="mllib-guide.html">MLlib</a> - Evaluation Metrics</h1>
+
+
+ <ul id="markdown-toc">
+ <li><a href="#classification-model-evaluation">Classification model evaluation</a> <ul>
+ <li><a href="#binary-classification">Binary classification</a> <ul>
+ <li><a href="#threshold-tuning">Threshold tuning</a></li>
+ </ul>
+ </li>
+ <li><a href="#multiclass-classification">Multiclass classification</a> <ul>
+ <li><a href="#label-based-metrics">Label based metrics</a></li>
+ </ul>
+ </li>
+ <li><a href="#multilabel-classification">Multilabel classification</a></li>
+ <li><a href="#ranking-systems">Ranking systems</a></li>
+ </ul>
+ </li>
+ <li><a href="#regression-model-evaluation">Regression model evaluation</a></li>
+</ul>
+
+<p>Spark&#8217;s MLlib comes with a number of machine learning algorithms that can be used to learn from and make predictions
+on data. When these algorithms are applied to build machine learning models, there is a need to evaluate the performance
+of the model on some criteria, which depends on the application and its requirements. Spark&#8217;s MLlib also provides a
+suite of metrics for the purpose of evaluating the performance of machine learning models.</p>
+
+<p>Specific machine learning algorithms fall under broader types of machine learning applications like classification,
+regression, clustering, etc. Each of these types has well-established metrics for performance evaluation, and those
+metrics that are currently available in Spark&#8217;s MLlib are detailed in this section.</p>
+
+<h2 id="classification-model-evaluation">Classification model evaluation</h2>
+
+<p>While there are many different types of classification algorithms, the evaluation of all classification models shares
+similar principles. In a <a href="https://en.wikipedia.org/wiki/Statistical_classification">supervised classification problem</a>,
+there exists a true output and a model-generated predicted output for each data point. For this reason, the results for
+each data point can be assigned to one of four categories:</p>
+
+<ul>
+ <li>True Positive (TP) - label is positive and prediction is also positive</li>
+ <li>True Negative (TN) - label is negative and prediction is also negative</li>
+ <li>False Positive (FP) - label is negative but prediction is positive</li>
+ <li>False Negative (FN) - label is positive but prediction is negative</li>
+</ul>
+
+<p>These four numbers are the building blocks for most classifier evaluation metrics. A fundamental point when considering
+classifier evaluation is that pure accuracy (i.e. was the prediction correct or incorrect) is not generally a good metric. The
+reason for this is that a dataset may be highly unbalanced. For example, if a model is designed to predict fraud from
+a dataset where 95% of the data points are <em>not fraud</em> and 5% of the data points are <em>fraud</em>, then a naive classifier
+that predicts <em>not fraud</em>, regardless of input, will be 95% accurate. For this reason, metrics like
+<a href="https://en.wikipedia.org/wiki/Precision_and_recall">precision and recall</a> are typically used because they take into
+account the <em>type</em> of error. In most applications there is some desired balance between precision and recall, which can
+be captured by combining the two into a single metric, called the <a href="https://en.wikipedia.org/wiki/F1_score">F-measure</a>.</p>
+
+<h3 id="binary-classification">Binary classification</h3>
+
+<p><a href="https://en.wikipedia.org/wiki/Binary_classification">Binary classifiers</a> are used to separate the elements of a given
+dataset into one of two possible groups (e.g. fraud or not fraud); binary classification is a special case of multiclass classification.
+Most binary classification metrics can be generalized to multiclass classification metrics.</p>
+
+<h4 id="threshold-tuning">Threshold tuning</h4>
+
+<p>It is important to understand that many classification models actually output a &#8220;score&#8221; (oftentimes a probability) for
+each class, where a higher score indicates higher likelihood. In the binary case, the model may output a probability for
+each class: $P(Y=1|X)$ and $P(Y=0|X)$. Instead of simply taking the higher probability, there may be some cases where
+the model might need to be tuned so that it only predicts a class when the probability is very high (e.g. only block a
+credit card transaction if the model predicts fraud with &gt;90% probability). Therefore, there is a prediction <em>threshold</em>
+which determines what the predicted class will be based on the probabilities that the model outputs.</p>
+
+<p>Tuning the prediction threshold will change the precision and recall of the model and is an important part of model
+optimization. In order to visualize how precision, recall, and other metrics change as a function of the threshold it is
+common practice to plot competing metrics against one another, parameterized by threshold. A P-R curve plots (precision,
+recall) points for different threshold values, while a
+<a href="https://en.wikipedia.org/wiki/Receiver_operating_characteristic">receiver operating characteristic</a>, or ROC, curve
+plots (recall, false positive rate) points.</p>
+
+<p><strong>Available metrics</strong></p>
+
+<table class="table">
+ <thead>
+ <tr><th>Metric</th><th>Definition</th></tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Precision (Positive Predictive Value)</td>
+ <td>$PPV=\frac{TP}{TP + FP}$</td>
+ </tr>
+ <tr>
+ <td>Recall (True Positive Rate)</td>
+ <td>$TPR=\frac{TP}{P}=\frac{TP}{TP + FN}$</td>
+ </tr>
+ <tr>
+ <td>F-measure</td>
+ <td>$F(\beta) = \left(1 + \beta^2\right) \cdot \left(\frac{PPV \cdot TPR}
+ {\beta^2 \cdot PPV + TPR}\right)$</td>
+ </tr>
+ <tr>
+ <td>Receiver Operating Characteristic (ROC)</td>
+ <td>$FPR(T)=\int^\infty_{T} P_0(t)\,dt \\ TPR(T)=\int^\infty_{T} P_1(t)\,dt$</td>
+ </tr>
+ <tr>
+ <td>Area Under ROC Curve</td>
+ <td>$AUROC=\int^1_{0} \frac{TP}{P} d\left(\frac{FP}{N}\right)$</td>
+ </tr>
+ <tr>
+ <td>Area Under Precision-Recall Curve</td>
+ <td>$AUPRC=\int^1_{0} \frac{TP}{TP+FP} d\left(\frac{TP}{P}\right)$</td>
+ </tr>
+ </tbody>
+</table>
+
+<p><strong>Examples</strong></p>
+
+<div class="codetabs">
+The following code snippets illustrate how to load a sample dataset, train a binary classification algorithm on the
+data, and evaluate the performance of the algorithm by several binary evaluation metrics.
+
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.evaluation.BinaryClassificationMetrics</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span>
+
+<span class="c1">// Load training data in LIBSVM format</span>
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;data/mllib/sample_binary_classification_data.txt&quot;</span><span class="o">)</span>
+
+<span class="c1">// Split data into training (60%) and test (40%)</span>
+<span class="k">val</span> <span class="nc">Array</span><span class="o">(</span><span class="n">training</span><span class="o">,</span> <span class="n">test</span><span class="o">)</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">randomSplit</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">),</span> <span class="n">seed</span> <span class="k">=</span> <span class="mi">11L</span><span class="o">)</span>
+<span class="n">training</span><span class="o">.</span><span class="n">cache</span><span class="o">()</span>
+
+<span class="c1">// Run training algorithm to build the model</span>
+<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">LogisticRegressionWithLBFGS</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setNumClasses</span><span class="o">(</span><span class="mi">2</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">run</span><span class="o">(</span><span class="n">training</span><span class="o">)</span>
+
+<span class="c1">// Clear the prediction threshold so the model will return probabilities</span>
+<span class="n">model</span><span class="o">.</span><span class="n">clearThreshold</span>
+
+<span class="c1">// Compute raw scores on the test set</span>
+<span class="k">val</span> <span class="n">predictionAndLabels</span> <span class="k">=</span> <span class="n">test</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="k">case</span> <span class="nc">LabeledPoint</span><span class="o">(</span><span class="n">label</span><span class="o">,</span> <span class="n">features</span><span class="o">)</span> <span class="k">=&gt;</span>
+ <span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">features</span><span class="o">)</span>
+ <span class="o">(</span><span class="n">prediction</span><span class="o">,</span> <span class="n">label</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Instantiate metrics object</span>
+<span class="k">val</span> <span class="n">metrics</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">BinaryClassificationMetrics</span><span class="o">(</span><span class="n">predictionAndLabels</span><span class="o">)</span>
+
+<span class="c1">// Precision by threshold</span>
+<span class="k">val</span> <span class="n">precision</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">precisionByThreshold</span>
+<span class="n">precision</span><span class="o">.</span><span class="n">foreach</span> <span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">t</span><span class="o">,</span> <span class="n">p</span><span class="o">)</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Threshold: $t, Precision: $p&quot;</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Recall by threshold</span>
+<span class="k">val</span> <span class="n">recall</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">recallByThreshold</span>
+<span class="n">recall</span><span class="o">.</span><span class="n">foreach</span> <span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">t</span><span class="o">,</span> <span class="n">r</span><span class="o">)</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Threshold: $t, Recall: $r&quot;</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Precision-Recall Curve</span>
+<span class="k">val</span> <span class="nc">PRC</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">pr</span>
+
+<span class="c1">// F-measure</span>
+<span class="k">val</span> <span class="n">f1Score</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">fMeasureByThreshold</span>
+<span class="n">f1Score</span><span class="o">.</span><span class="n">foreach</span> <span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">t</span><span class="o">,</span> <span class="n">f</span><span class="o">)</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Threshold: $t, F-score: $f, Beta = 1&quot;</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="k">val</span> <span class="n">beta</span> <span class="k">=</span> <span class="mf">0.5</span>
+<span class="k">val</span> <span class="n">fScore</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">fMeasureByThreshold</span><span class="o">(</span><span class="n">beta</span><span class="o">)</span>
+<span class="n">fScore</span><span class="o">.</span><span class="n">foreach</span> <span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">t</span><span class="o">,</span> <span class="n">f</span><span class="o">)</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Threshold: $t, F-score: $f, Beta = 0.5&quot;</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// AUPRC</span>
+<span class="k">val</span> <span class="n">auPRC</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">areaUnderPR</span>
+<span class="n">println</span><span class="o">(</span><span class="s">&quot;Area under precision-recall curve = &quot;</span> <span class="o">+</span> <span class="n">auPRC</span><span class="o">)</span>
+
+<span class="c1">// Compute thresholds used in ROC and PR curves</span>
+<span class="k">val</span> <span class="n">thresholds</span> <span class="k">=</span> <span class="n">precision</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">_1</span><span class="o">)</span>
+
+<span class="c1">// ROC Curve</span>
+<span class="k">val</span> <span class="n">roc</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">roc</span>
+
+<span class="c1">// AUROC</span>
+<span class="k">val</span> <span class="n">auROC</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">areaUnderROC</span>
+<span class="n">println</span><span class="o">(</span><span class="s">&quot;Area under ROC = &quot;</span> <span class="o">+</span> <span class="n">auROC</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.rdd.RDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.LogisticRegressionModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.evaluation.BinaryClassificationMetrics</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.SparkConf</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.SparkContext</span><span class="o">;</span>
+
+<span class="kd">public</span> <span class="kd">class</span> <span class="nc">BinaryClassification</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="kd">static</span> <span class="kt">void</span> <span class="nf">main</span><span class="o">(</span><span class="n">String</span><span class="o">[]</span> <span class="n">args</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">SparkConf</span> <span class="n">conf</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">SparkConf</span><span class="o">().</span><span class="na">setAppName</span><span class="o">(</span><span class="s">&quot;Binary Classification Metrics&quot;</span><span class="o">);</span>
+ <span class="n">SparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">SparkContext</span><span class="o">(</span><span class="n">conf</span><span class="o">);</span>
+ <span class="n">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">&quot;data/mllib/sample_binary_classification_data.txt&quot;</span><span class="o">;</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="n">path</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span>
+
+ <span class="c1">// Split initial RDD into two... [60% training data, 40% testing data].</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;[]</span> <span class="n">splits</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">randomSplit</span><span class="o">(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]</span> <span class="o">{</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">},</span> <span class="mi">11L</span><span class="o">);</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">training</span> <span class="o">=</span> <span class="n">splits</span><span class="o">[</span><span class="mi">0</span><span class="o">].</span><span class="na">cache</span><span class="o">();</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">test</span> <span class="o">=</span> <span class="n">splits</span><span class="o">[</span><span class="mi">1</span><span class="o">];</span>
+
+ <span class="c1">// Run training algorithm to build the model.</span>
+ <span class="kd">final</span> <span class="n">LogisticRegressionModel</span> <span class="n">model</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">LogisticRegressionWithLBFGS</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setNumClasses</span><span class="o">(</span><span class="mi">2</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">run</span><span class="o">(</span><span class="n">training</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span>
+
+ <span class="c1">// Clear the prediction threshold so the model will return probabilities</span>
+ <span class="n">model</span><span class="o">.</span><span class="na">clearThreshold</span><span class="o">();</span>
+
+ <span class="c1">// Compute raw scores on the test set.</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">predictionAndLabels</span> <span class="o">=</span> <span class="n">test</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">,</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">LabeledPoint</span> <span class="n">p</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">Double</span> <span class="n">prediction</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="na">predict</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="na">features</span><span class="o">());</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;(</span><span class="n">prediction</span><span class="o">,</span> <span class="n">p</span><span class="o">.</span><span class="na">label</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+
+ <span class="c1">// Get evaluation metrics.</span>
+ <span class="n">BinaryClassificationMetrics</span> <span class="n">metrics</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">BinaryClassificationMetrics</span><span class="o">(</span><span class="n">predictionAndLabels</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span>
+
+ <span class="c1">// Precision by threshold</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">precision</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">precisionByThreshold</span><span class="o">().</span><span class="na">toJavaRDD</span><span class="o">();</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Precision by threshold: &quot;</span> <span class="o">+</span> <span class="n">precision</span><span class="o">.</span><span class="na">toArray</span><span class="o">());</span>
+
+ <span class="c1">// Recall by threshold</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">recall</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">recallByThreshold</span><span class="o">().</span><span class="na">toJavaRDD</span><span class="o">();</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Recall by threshold: &quot;</span> <span class="o">+</span> <span class="n">recall</span><span class="o">.</span><span class="na">toArray</span><span class="o">());</span>
+
+ <span class="c1">// F Score by threshold</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">f1Score</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">fMeasureByThreshold</span><span class="o">().</span><span class="na">toJavaRDD</span><span class="o">();</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;F1 Score by threshold: &quot;</span> <span class="o">+</span> <span class="n">f1Score</span><span class="o">.</span><span class="na">toArray</span><span class="o">());</span>
+
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">f2Score</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">fMeasureByThreshold</span><span class="o">(</span><span class="mf">2.0</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;F2 Score by threshold: &quot;</span> <span class="o">+</span> <span class="n">f2Score</span><span class="o">.</span><span class="na">toArray</span><span class="o">());</span>
+
+ <span class="c1">// Precision-recall curve</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">prc</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">pr</span><span class="o">().</span><span class="na">toJavaRDD</span><span class="o">();</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Precision-recall curve: &quot;</span> <span class="o">+</span> <span class="n">prc</span><span class="o">.</span><span class="na">toArray</span><span class="o">());</span>
+
+ <span class="c1">// Thresholds</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Double</span><span class="o">&gt;</span> <span class="n">thresholds</span> <span class="o">=</span> <span class="n">precision</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;,</span> <span class="n">Double</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Double</span> <span class="nf">call</span> <span class="o">(</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;</span> <span class="n">t</span><span class="o">)</span> <span class="o">{</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="nf">Double</span><span class="o">(</span><span class="n">t</span><span class="o">.</span><span class="na">_1</span><span class="o">().</span><span class="na">toString</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+
+ <span class="c1">// ROC Curve</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">roc</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">roc</span><span class="o">().</span><span class="na">toJavaRDD</span><span class="o">();</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;ROC curve: &quot;</span> <span class="o">+</span> <span class="n">roc</span><span class="o">.</span><span class="na">toArray</span><span class="o">());</span>
+
+ <span class="c1">// AUPRC</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Area under precision-recall curve = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="na">areaUnderPR</span><span class="o">());</span>
+
+ <span class="c1">// AUROC</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Area under ROC = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="na">areaUnderROC</span><span class="o">());</span>
+
+ <span class="c1">// Save and load model</span>
+ <span class="n">model</span><span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;myModelPath&quot;</span><span class="o">);</span>
+ <span class="n">LogisticRegressionModel</span> <span class="n">sameModel</span> <span class="o">=</span> <span class="n">LogisticRegressionModel</span><span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;myModelPath&quot;</span><span class="o">);</span>
+ <span class="o">}</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.classification</span> <span class="kn">import</span> <span class="n">LogisticRegressionWithLBFGS</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.evaluation</span> <span class="kn">import</span> <span class="n">BinaryClassificationMetrics</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="kn">import</span> <span class="n">LabeledPoint</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span>
+
+<span class="c"># Several of the methods available in Scala are currently missing from PySpark</span>
+
+<span class="c"># Load training data in LIBSVM format</span>
+<span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">&quot;data/mllib/sample_binary_classification_data.txt&quot;</span><span class="p">)</span>
+
+<span class="c"># Split data into training (60%) and test (40%)</span>
+<span class="n">training</span><span class="p">,</span> <span class="n">test</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">randomSplit</span><span class="p">([</span><span class="mf">0.6</span><span class="p">,</span> <span class="mf">0.4</span><span class="p">],</span> <span class="n">seed</span> <span class="o">=</span> <span class="il">11L</span><span class="p">)</span>
+<span class="n">training</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span>
+
+<span class="c"># Run training algorithm to build the model</span>
+<span class="n">model</span> <span class="o">=</span> <span class="n">LogisticRegressionWithLBFGS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">training</span><span class="p">)</span>
+
+<span class="c"># Compute raw scores on the test set</span>
+<span class="n">predictionAndLabels</span> <span class="o">=</span> <span class="n">test</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">lp</span><span class="p">:</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">lp</span><span class="o">.</span><span class="n">features</span><span class="p">)),</span> <span class="n">lp</span><span class="o">.</span><span class="n">label</span><span class="p">))</span>
+
+<span class="c"># Instantiate metrics object</span>
+<span class="n">metrics</span> <span class="o">=</span> <span class="n">BinaryClassificationMetrics</span><span class="p">(</span><span class="n">predictionAndLabels</span><span class="p">)</span>
+
+<span class="c"># Area under precision-recall curve</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Area under PR = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">areaUnderPR</span><span class="p">)</span>
+
+<span class="c"># Area under ROC curve</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Area under ROC = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">areaUnderROC</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h3 id="multiclass-classification">Multiclass classification</h3>
+
+<p><a href="https://en.wikipedia.org/wiki/Multiclass_classification">Multiclass classification</a> describes a classification
+problem where there are $M \gt 2$ possible labels for each data point (the case where $M=2$ is the binary
+classification problem). For example, classifying handwriting samples as the digits 0 to 9 is a multiclass problem with 10 possible classes.</p>
+
+<p>For multiclass metrics, the notion of positives and negatives is slightly different. Predictions and labels can still
+be positive or negative, but they must be considered in the context of a particular class. Each label and prediction
+takes on the value of one of the multiple classes and so they are said to be positive for their particular class and negative
+for all other classes. So, a true positive occurs whenever the prediction and the label match, while a true negative
+occurs when neither the prediction nor the label takes on the value of a given class. By this convention, there can be
+multiple true negatives for a given data sample. The extension of false negatives and false positives from the former
+definitions of positive and negative labels is straightforward.</p>
+
+<h4 id="label-based-metrics">Label based metrics</h4>
+
+<p>As opposed to binary classification, where there are only two possible labels, multiclass classification problems have many
+possible labels and so the concept of label-based metrics is introduced. Overall precision measures precision across all
+labels - the number of times any class was predicted correctly (true positives) normalized by the number of data
+points. Precision by label considers only one class, and measures the number of times a specific label was predicted
+correctly normalized by the number of times that label appears in the output.</p>
+
+<p><strong>Available metrics</strong></p>
+
+<p>Define the class, or label, set as</p>
+
+<script type="math/tex; mode=display">L = \{\ell_0, \ell_1, \ldots, \ell_{M-1} \} </script>
+
+<p>The true output vector $\mathbf{y}$ consists of $N$ elements</p>
+
+<script type="math/tex; mode=display">\mathbf{y}_0, \mathbf{y}_1, \ldots, \mathbf{y}_{N-1} \in L </script>
+
+<p>A multiclass prediction algorithm generates a prediction vector $\hat{\mathbf{y}}$ of $N$ elements</p>
+
+<script type="math/tex; mode=display">\hat{\mathbf{y}}_0, \hat{\mathbf{y}}_1, \ldots, \hat{\mathbf{y}}_{N-1} \in L </script>
+
+<p>For this section, a modified delta function $\hat{\delta}(x)$ will prove useful</p>
+
+<script type="math/tex; mode=display">% <![CDATA[
+\hat{\delta}(x) = \begin{cases}1 & \text{if $x = 0$}, \\ 0 & \text{otherwise}.\end{cases} %]]></script>
+
+<table class="table">
+ <thead>
+ <tr><th>Metric</th><th>Definition</th></tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Confusion Matrix</td>
+ <td>
+ $C_{ij} = \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_i) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_j)\\ \\
+ \left( \begin{array}{ccc}
+ \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_0) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_0) &amp; \ldots &amp;
+ \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_0) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_{M-1}) \\
+ \vdots &amp; \ddots &amp; \vdots \\
+ \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_{M-1}) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_0) &amp; \ldots &amp;
+ \sum_{k=0}^{N-1} \hat{\delta}(\mathbf{y}_k-\ell_{M-1}) \cdot \hat{\delta}(\hat{\mathbf{y}}_k - \ell_{M-1})
+ \end{array} \right)$
+ </td>
+ </tr>
+ <tr>
+ <td>Overall Precision</td>
+ <td>$PPV = \frac{TP}{TP + FP} = \frac{1}{N}\sum_{i=0}^{N-1} \hat{\delta}\left(\hat{\mathbf{y}}_i -
+ \mathbf{y}_i\right)$</td>
+ </tr>
+ <tr>
+ <td>Overall Recall</td>
+ <td>$TPR = \frac{TP}{TP + FN} = \frac{1}{N}\sum_{i=0}^{N-1} \hat{\delta}\left(\hat{\mathbf{y}}_i -
+ \mathbf{y}_i\right)$</td>
+ </tr>
+ <tr>
+ <td>Overall F1-measure</td>
+ <td>$F1 = 2 \cdot \left(\frac{PPV \cdot TPR}
+ {PPV + TPR}\right)$</td>
+ </tr>
+ <tr>
+ <td>Precision by label</td>
+ <td>$PPV(\ell) = \frac{TP}{TP + FP} =
+ \frac{\sum_{i=0}^{N-1} \hat{\delta}(\hat{\mathbf{y}}_i - \ell) \cdot \hat{\delta}(\mathbf{y}_i - \ell)}
+ {\sum_{i=0}^{N-1} \hat{\delta}(\hat{\mathbf{y}}_i - \ell)}$</td>
+ </tr>
+ <tr>
+ <td>Recall by label</td>
+ <td>$TPR(\ell)=\frac{TP}{P} =
+ \frac{\sum_{i=0}^{N-1} \hat{\delta}(\hat{\mathbf{y}}_i - \ell) \cdot \hat{\delta}(\mathbf{y}_i - \ell)}
+ {\sum_{i=0}^{N-1} \hat{\delta}(\mathbf{y}_i - \ell)}$</td>
+ </tr>
+ <tr>
+ <td>F-measure by label</td>
+ <td>$F(\beta, \ell) = \left(1 + \beta^2\right) \cdot \left(\frac{PPV(\ell) \cdot TPR(\ell)}
+ {\beta^2 \cdot PPV(\ell) + TPR(\ell)}\right)$</td>
+ </tr>
+ <tr>
+ <td>Weighted precision</td>
+ <td>$PPV_{w}= \frac{1}{N} \sum\nolimits_{\ell \in L} PPV(\ell)
+ \cdot \sum_{i=0}^{N-1} \hat{\delta}(\mathbf{y}_i-\ell)$</td>
+ </tr>
+ <tr>
+ <td>Weighted recall</td>
+ <td>$TPR_{w}= \frac{1}{N} \sum\nolimits_{\ell \in L} TPR(\ell)
+ \cdot \sum_{i=0}^{N-1} \hat{\delta}(\mathbf{y}_i-\ell)$</td>
+ </tr>
+ <tr>
+ <td>Weighted F-measure</td>
+ <td>$F_{w}(\beta)= \frac{1}{N} \sum\nolimits_{\ell \in L} F(\beta, \ell)
+ \cdot \sum_{i=0}^{N-1} \hat{\delta}(\mathbf{y}_i-\ell)$</td>
+ </tr>
+ </tbody>
+</table>
+
+<p><strong>Examples</strong></p>
+
+<div class="codetabs">
+The following code snippets illustrate how to load a sample dataset, train a multiclass classification algorithm on
+the data, and evaluate the performance of the algorithm by several multiclass classification evaluation metrics.
+
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.evaluation.MulticlassMetrics</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span>
+
+<span class="c1">// Load training data in LIBSVM format</span>
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;data/mllib/sample_multiclass_classification_data.txt&quot;</span><span class="o">)</span>
+
+<span class="c1">// Split data into training (60%) and test (40%)</span>
+<span class="k">val</span> <span class="nc">Array</span><span class="o">(</span><span class="n">training</span><span class="o">,</span> <span class="n">test</span><span class="o">)</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">randomSplit</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">),</span> <span class="n">seed</span> <span class="k">=</span> <span class="mi">11L</span><span class="o">)</span>
+<span class="n">training</span><span class="o">.</span><span class="n">cache</span><span class="o">()</span>
+
+<span class="c1">// Run training algorithm to build the model</span>
+<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">LogisticRegressionWithLBFGS</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setNumClasses</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">run</span><span class="o">(</span><span class="n">training</span><span class="o">)</span>
+
+<span class="c1">// Compute predictions on the test set</span>
+<span class="k">val</span> <span class="n">predictionAndLabels</span> <span class="k">=</span> <span class="n">test</span><span class="o">.</span><span class="n">map</span> <span class="o">{</span> <span class="k">case</span> <span class="nc">LabeledPoint</span><span class="o">(</span><span class="n">label</span><span class="o">,</span> <span class="n">features</span><span class="o">)</span> <span class="k">=&gt;</span>
+ <span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">features</span><span class="o">)</span>
+ <span class="o">(</span><span class="n">prediction</span><span class="o">,</span> <span class="n">label</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Instantiate metrics object</span>
+<span class="k">val</span> <span class="n">metrics</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">MulticlassMetrics</span><span class="o">(</span><span class="n">predictionAndLabels</span><span class="o">)</span>
+
+<span class="c1">// Confusion matrix</span>
+<span class="n">println</span><span class="o">(</span><span class="s">&quot;Confusion matrix:&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">metrics</span><span class="o">.</span><span class="n">confusionMatrix</span><span class="o">)</span>
+
+<span class="c1">// Overall Statistics</span>
+<span class="k">val</span> <span class="n">precision</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">precision</span>
+<span class="k">val</span> <span class="n">recall</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">recall</span> <span class="c1">// same as true positive rate</span>
+<span class="k">val</span> <span class="n">f1Score</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">fMeasure</span>
+<span class="n">println</span><span class="o">(</span><span class="s">&quot;Summary Statistics&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Precision = $precision&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Recall = $recall&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;F1 Score = $f1Score&quot;</span><span class="o">)</span>
+
+<span class="c1">// Precision by label</span>
+<span class="k">val</span> <span class="n">labels</span> <span class="k">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">labels</span>
+<span class="n">labels</span><span class="o">.</span><span class="n">foreach</span> <span class="o">{</span> <span class="n">l</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Precision($l) = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="n">precision</span><span class="o">(</span><span class="n">l</span><span class="o">))</span>
+<span class="o">}</span>
+
+<span class="c1">// Recall by label</span>
+<span class="n">labels</span><span class="o">.</span><span class="n">foreach</span> <span class="o">{</span> <span class="n">l</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Recall($l) = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="n">recall</span><span class="o">(</span><span class="n">l</span><span class="o">))</span>
+<span class="o">}</span>
+
+<span class="c1">// False positive rate by label</span>
+<span class="n">labels</span><span class="o">.</span><span class="n">foreach</span> <span class="o">{</span> <span class="n">l</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;FPR($l) = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="n">falsePositiveRate</span><span class="o">(</span><span class="n">l</span><span class="o">))</span>
+<span class="o">}</span>
+
+<span class="c1">// F-measure by label</span>
+<span class="n">labels</span><span class="o">.</span><span class="n">foreach</span> <span class="o">{</span> <span class="n">l</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;F1-Score($l) = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="n">fMeasure</span><span class="o">(</span><span class="n">l</span><span class="o">))</span>
+<span class="o">}</span>
+
+<span class="c1">// Weighted stats</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Weighted precision: ${metrics.weightedPrecision}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Weighted recall: ${metrics.weightedRecall}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Weighted F1 score: ${metrics.weightedFMeasure}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Weighted false positive rate: ${metrics.weightedFalsePositiveRate}&quot;</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.rdd.RDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.LogisticRegressionModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.evaluation.MulticlassMetrics</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Matrix</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.SparkConf</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.SparkContext</span><span class="o">;</span>
+
+<span class="kd">public</span> <span class="kd">class</span> <span class="nc">MulticlassClassification</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="kd">static</span> <span class="kt">void</span> <span class="nf">main</span><span class="o">(</span><span class="n">String</span><span class="o">[]</span> <span class="n">args</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">SparkConf</span> <span class="n">conf</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">SparkConf</span><span class="o">().</span><span class="na">setAppName</span><span class="o">(</span><span class="s">&quot;Multiclass Classification Metrics&quot;</span><span class="o">);</span>
+ <span class="n">SparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">SparkContext</span><span class="o">(</span><span class="n">conf</span><span class="o">);</span>
+ <span class="n">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">&quot;data/mllib/sample_multiclass_classification_data.txt&quot;</span><span class="o">;</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="n">path</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span>
+
+ <span class="c1">// Split initial RDD into two... [60% training data, 40% testing data].</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;[]</span> <span class="n">splits</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">randomSplit</span><span class="o">(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]</span> <span class="o">{</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">},</span> <span class="mi">11L</span><span class="o">);</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">training</span> <span class="o">=</span> <span class="n">splits</span><span class="o">[</span><span class="mi">0</span><span class="o">].</span><span class="na">cache</span><span class="o">();</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">test</span> <span class="o">=</span> <span class="n">splits</span><span class="o">[</span><span class="mi">1</span><span class="o">];</span>
+
+ <span class="c1">// Run training algorithm to build the model.</span>
+ <span class="kd">final</span> <span class="n">LogisticRegressionModel</span> <span class="n">model</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">LogisticRegressionWithLBFGS</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setNumClasses</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">run</span><span class="o">(</span><span class="n">training</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span>
+
+ <span class="c1">// Compute predictions on the test set.</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">predictionAndLabels</span> <span class="o">=</span> <span class="n">test</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">,</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">LabeledPoint</span> <span class="n">p</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">Double</span> <span class="n">prediction</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="na">predict</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="na">features</span><span class="o">());</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;(</span><span class="n">prediction</span><span class="o">,</span> <span class="n">p</span><span class="o">.</span><span class="na">label</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+
+ <span class="c1">// Get evaluation metrics.</span>
+ <span class="n">MulticlassMetrics</span> <span class="n">metrics</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">MulticlassMetrics</span><span class="o">(</span><span class="n">predictionAndLabels</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span>
+
+ <span class="c1">// Confusion matrix</span>
+ <span class="n">Matrix</span> <span class="n">confusion</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">confusionMatrix</span><span class="o">();</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Confusion matrix: \n&quot;</span> <span class="o">+</span> <span class="n">confusion</span><span class="o">);</span>
+
+ <span class="c1">// Overall statistics</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Precision = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="na">precision</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;Recall = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="na">recall</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">&quot;F1 Score = &quot;</span> <span class="o">+</span> <span class="n">metrics</span><span class="o">.</span><span class="na">fMeasure</span><span class="o">());</span>
+
+ <span class="c1">// Stats by labels</span>
+ <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">().</span><span class="na">length</span><span class="o">;</span> <span class="n">i</span><span class="o">++)</span> <span class="o">{</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Class %f precision = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">],</span> <span class="n">metrics</span><span class="o">.</span><span class="na">precision</span><span class="o">(</span><span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">]));</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Class %f recall = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">],</span> <span class="n">metrics</span><span class="o">.</span><span class="na">recall</span><span class="o">(</span><span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">]));</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Class %f F1 score = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">],</span> <span class="n">metrics</span><span class="o">.</span><span class="na">fMeasure</span><span class="o">(</span><span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">]));</span>
+ <span class="o">}</span>
+
+ <span class="c1">//Weighted stats</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Weighted precision = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">weightedPrecision</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Weighted recall = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">weightedRecall</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Weighted F1 score = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">weightedFMeasure</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Weighted false positive rate = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">weightedFalsePositiveRate</span><span class="o">());</span>
+
+ <span class="c1">// Save and load model</span>
+ <span class="n">model</span><span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;myModelPath&quot;</span><span class="o">);</span>
+ <span class="n">LogisticRegressionModel</span> <span class="n">sameModel</span> <span class="o">=</span> <span class="n">LogisticRegressionModel</span><span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;myModelPath&quot;</span><span class="o">);</span>
+ <span class="o">}</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.classification</span> <span class="kn">import</span> <span class="n">LogisticRegressionWithLBFGS</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.evaluation</span> <span class="kn">import</span> <span class="n">MulticlassMetrics</span>
+
+<span class="c"># Load training data in LIBSVM format</span>
+<span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">&quot;data/mllib/sample_multiclass_classification_data.txt&quot;</span><span class="p">)</span>
+
+<span class="c"># Split data into training (60%) and test (40%)</span>
+<span class="n">training</span><span class="p">,</span> <span class="n">test</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">randomSplit</span><span class="p">([</span><span class="mf">0.6</span><span class="p">,</span> <span class="mf">0.4</span><span class="p">],</span> <span class="n">seed</span> <span class="o">=</span> <span class="mi">11</span><span class="p">)</span>
+<span class="n">training</span><span class="o">.</span><span class="n">cache</span><span class="p">()</span>
+
+<span class="c"># Run training algorithm to build the model</span>
+<span class="n">model</span> <span class="o">=</span> <span class="n">LogisticRegressionWithLBFGS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">training</span><span class="p">,</span> <span class="n">numClasses</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
+
+<span class="c"># Compute raw scores on the test set</span>
+<span class="n">predictionAndLabels</span> <span class="o">=</span> <span class="n">test</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">lp</span><span class="p">:</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">lp</span><span class="o">.</span><span class="n">features</span><span class="p">)),</span> <span class="n">lp</span><span class="o">.</span><span class="n">label</span><span class="p">))</span>
+
+<span class="c"># Instantiate metrics object</span>
+<span class="n">metrics</span> <span class="o">=</span> <span class="n">MulticlassMetrics</span><span class="p">(</span><span class="n">predictionAndLabels</span><span class="p">)</span>
+
+<span class="c"># Overall statistics</span>
+<span class="n">precision</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">precision</span><span class="p">()</span>
+<span class="n">recall</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">recall</span><span class="p">()</span>
+<span class="n">f1Score</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="n">fMeasure</span><span class="p">()</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Summary Stats&quot;</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Precision = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">precision</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Recall = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">recall</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;F1 Score = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">f1Score</span><span class="p">)</span>
+
+<span class="c"># Statistics by class</span>
+<span class="n">labels</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">lp</span><span class="p">:</span> <span class="n">lp</span><span class="o">.</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
+<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">labels</span><span class="p">):</span>
+ <span class="k">print</span><span class="p">(</span><span class="s">&quot;Class </span><span class="si">%s</span><span class="s"> precision = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">metrics</span><span class="o">.</span><span class="n">precision</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span>
+ <span class="k">print</span><span class="p">(</span><span class="s">&quot;Class </span><span class="si">%s</span><span class="s"> recall = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">metrics</span><span class="o">.</span><span class="n">recall</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span>
+ <span class="k">print</span><span class="p">(</span><span class="s">&quot;Class </span><span class="si">%s</span><span class="s"> F1 Measure = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">metrics</span><span class="o">.</span><span class="n">fMeasure</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">beta</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)))</span>
+
+<span class="c"># Weighted stats</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Weighted recall = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">weightedRecall</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Weighted precision = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">weightedPrecision</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Weighted F(1) Score = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">weightedFMeasure</span><span class="p">())</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Weighted F(0.5) Score = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">weightedFMeasure</span><span class="p">(</span><span class="n">beta</span><span class="o">=</span><span class="mf">0.5</span><span class="p">))</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Weighted false positive rate = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">weightedFalsePositiveRate</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h3 id="multilabel-classification">Multilabel classification</h3>
+
+<p>A <a href="https://en.wikipedia.org/wiki/Multi-label_classification">multilabel classification</a> problem involves mapping
+each sample in a dataset to a set of class labels. In this type of classification problem, the labels are not
+mutually exclusive. For example, when classifying a set of news articles into topics, a single article might be both
+science and politics.</p>
+
+<p>Because the labels are not mutually exclusive, the predictions and true labels are now vectors of label <em>sets</em>, rather
+than vectors of labels. Multilabel metrics, therefore, extend the fundamental ideas of precision, recall, etc. to
+operations on sets. For example, a true positive for a given class now occurs when that class exists in the predicted
+set and it exists in the true label set, for a specific data point.</p>
+
+<p><strong>Available metrics</strong></p>
+
+<p>Here we define a set $D$ of $N$ documents</p>
+
+<script type="math/tex; mode=display">D = \left\{d_0, d_1, ..., d_{N-1}\right\}</script>
+
+<p>Define $L_0, L_1, \ldots, L_{N-1}$ to be a family of label sets and $P_0, P_1, \ldots, P_{N-1}$
+to be a family of prediction sets where $L_i$ and $P_i$ are the label set and prediction set, respectively, that
+correspond to document $d_i$.</p>
+
+<p>The set of all unique labels is given by</p>
+
+<script type="math/tex; mode=display">L = \bigcup_{k=0}^{N-1} L_k</script>
+
+<p>The following definition of indicator function $I_A(x)$ on a set $A$ will be necessary</p>
+
+<script type="math/tex; mode=display">% <![CDATA[
+I_A(x) = \begin{cases}1 & \text{if $x \in A$}, \\ 0 & \text{otherwise}.\end{cases} %]]></script>
+
+<table class="table">
+ <thead>
+ <tr><th>Metric</th><th>Definition</th></tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Precision</td><td>$\frac{1}{N} \sum_{i=0}^{N-1} \frac{\left|P_i \cap L_i\right|}{\left|P_i\right|}$</td>
+ </tr>
+ <tr>
+ <td>Recall</td><td>$\frac{1}{N} \sum_{i=0}^{N-1} \frac{\left|L_i \cap P_i\right|}{\left|L_i\right|}$</td>
+ </tr>
+ <tr>
+ <td>Accuracy</td>
+ <td>
+ $\frac{1}{N} \sum_{i=0}^{N - 1} \frac{\left|L_i \cap P_i \right|}
+ {\left|L_i\right| + \left|P_i\right| - \left|L_i \cap P_i \right|}$
+ </td>
+ </tr>
+ <tr>
+ <td>Precision by label</td><td>$PPV(\ell)=\frac{TP}{TP + FP}=
+ \frac{\sum_{i=0}^{N-1} I_{P_i}(\ell) \cdot I_{L_i}(\ell)}
+ {\sum_{i=0}^{N-1} I_{P_i}(\ell)}$</td>
+ </tr>
+ <tr>
+ <td>Recall by label</td><td>$TPR(\ell)=\frac{TP}{P}=
+ \frac{\sum_{i=0}^{N-1} I_{P_i}(\ell) \cdot I_{L_i}(\ell)}
+ {\sum_{i=0}^{N-1} I_{L_i}(\ell)}$</td>
+ </tr>
+ <tr>
+ <td>F1-measure by label</td><td>$F1(\ell) = 2
+ \cdot \left(\frac{PPV(\ell) \cdot TPR(\ell)}
+ {PPV(\ell) + TPR(\ell)}\right)$</td>
+ </tr>
+ <tr>
+ <td>Hamming Loss</td>
+ <td>
+ $\frac{1}{N \cdot \left|L\right|} \sum_{i=0}^{N - 1} \left(\left|L_i\right| + \left|P_i\right| - 2\left|L_i
+ \cap P_i\right|\right)$
+ </td>
+ </tr>
+ <tr>
+ <td>Subset Accuracy</td>
+ <td>$\frac{1}{N} \sum_{i=0}^{N-1} I_{\{L_i\}}(P_i)$</td>
+ </tr>
+ <tr>
+ <td>F1 Measure</td>
+ <td>$\frac{1}{N} \sum_{i=0}^{N-1} 2 \frac{\left|P_i \cap L_i\right|}{\left|P_i\right| + \left|L_i\right|}$</td>
+ </tr>
+ <tr>
+ <td>Micro precision</td>
+ <td>$\frac{TP}{TP + FP}=\frac{\sum_{i=0}^{N-1} \left|P_i \cap L_i\right|}
+ {\sum_{i=0}^{N-1} \left|P_i \cap L_i\right| + \sum_{i=0}^{N-1} \left|P_i - L_i\right|}$</td>
+ </tr>
+ <tr>
+ <td>Micro recall</td>
+ <td>$\frac{TP}{TP + FN}=\frac{\sum_{i=0}^{N-1} \left|P_i \cap L_i\right|}
+ {\sum_{i=0}^{N-1} \left|P_i \cap L_i\right| + \sum_{i=0}^{N-1} \left|L_i - P_i\right|}$</td>
+ </tr>
+ <tr>
+ <td>Micro F1 Measure</td>
+ <td>
+ $2 \cdot \frac{TP}{2 \cdot TP + FP + FN}=2 \cdot \frac{\sum_{i=0}^{N-1} \left|P_i \cap L_i\right|}{2 \cdot
+ \sum_{i=0}^{N-1} \left|P_i \cap L_i\right| + \sum_{i=0}^{N-1} \left|L_i - P_i\right| + \sum_{i=0}^{N-1}
+ \left|P_i - L_i\right|}$
+ </td>
+ </tr>
+ </tbody>
+</table>
+
+<p><strong>Examples</strong></p>
+
+<p>The following code snippets illustrate how to evaluate the performance of a multilabel classifier. The examples
+use the fake prediction and label data for multilabel classification that is shown below.</p>
+
+<p>Document predictions:</p>
+
+<ul>
+ <li>doc 0 - predict 0, 1 - class 0, 2</li>
+ <li>doc 1 - predict 0, 2 - class 0, 1</li>
+ <li>doc 2 - predict none - class 0</li>
+ <li>doc 3 - predict 2 - class 2</li>
+ <li>doc 4 - predict 2, 0 - class 2, 0</li>
+ <li>doc 5 - predict 0, 1, 2 - class 0, 1</li>
+ <li>doc 6 - predict 1 - class 1, 2</li>
+</ul>
+
+<p>Predicted classes:</p>
+
+<ul>
+ <li>class 0 - doc 0, 1, 4, 5 (total 4)</li>
+ <li>class 1 - doc 0, 5, 6 (total 3)</li>
+ <li>class 2 - doc 1, 3, 4, 5 (total 4)</li>
+</ul>
+
+<p>True classes:</p>
+
+<ul>
+ <li>class 0 - doc 0, 1, 2, 4, 5 (total 5)</li>
+ <li>class 1 - doc 1, 5, 6 (total 3)</li>
+ <li>class 2 - doc 0, 3, 4, 6 (total 4)</li>
+</ul>
+
+<div class="codetabs">
+
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.evaluation.MultilabelMetrics</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.rdd.RDD</span><span class="o">;</span>
+
+<span class="k">val</span> <span class="n">scoreAndLabels</span><span class="k">:</span> <span class="kt">RDD</span><span class="o">[(</span><span class="kt">Array</span><span class="o">[</span><span class="kt">Double</span><span class="o">]</span>, <span class="kt">Array</span><span class="o">[</span><span class="kt">Double</span><span class="o">])]</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="o">(</span>
+ <span class="nc">Seq</span><span class="o">((</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span> <span class="nc">Array</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">)),</span>
+ <span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span> <span class="nc">Array</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">)),</span>
+ <span class="o">(</span><span class="nc">Array</span><span class="o">(),</span> <span class="nc">Array</span><span class="o">(</span><span class="mf">0.0</span><span class="o">)),</span>
+ <span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">2.0</span><span class="o">),</span> <span class="nc">Array</span><span class="o">(</span><span class="mf">2.0</span><span class="o">)),</span>
+ <span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span> <span class="nc">Array</span><span class="o">(</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)),</span>
+ <span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">),</span> <span class="nc">Array</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">)),</span>
+ <span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">1.0</span><span class="o">),</span> <span class="nc">Array</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">))),</span> <span class="mi">2</span><span class="o">)</span>
+
+<span class="c1">// Instantiate metrics object</span>
+<span class="k">val</span> <span class="n">metrics</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">MultilabelMetrics</span><span class="o">(</span><span class="n">scoreAndLabels</span><span class="o">)</span>
+
+<span class="c1">// Summary stats</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Recall = ${metrics.recall}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Precision = ${metrics.precision}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;F1 measure = ${metrics.f1Measure}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Accuracy = ${metrics.accuracy}&quot;</span><span class="o">)</span>
+
+<span class="c1">// Individual label stats</span>
+<span class="n">metrics</span><span class="o">.</span><span class="n">labels</span><span class="o">.</span><span class="n">foreach</span><span class="o">(</span><span class="n">label</span> <span class="k">=&gt;</span> <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Class $label precision = ${metrics.precision(label)}&quot;</span><span class="o">))</span>
+<span class="n">metrics</span><span class="o">.</span><span class="n">labels</span><span class="o">.</span><span class="n">foreach</span><span class="o">(</span><span class="n">label</span> <span class="k">=&gt;</span> <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Class $label recall = ${metrics.recall(label)}&quot;</span><span class="o">))</span>
+<span class="n">metrics</span><span class="o">.</span><span class="n">labels</span><span class="o">.</span><span class="n">foreach</span><span class="o">(</span><span class="n">label</span> <span class="k">=&gt;</span> <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Class $label F1-score = ${metrics.f1Measure(label)}&quot;</span><span class="o">))</span>
+
+<span class="c1">// Micro stats</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Micro recall = ${metrics.microRecall}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Micro precision = ${metrics.microPrecision}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Micro F1 measure = ${metrics.microF1Measure}&quot;</span><span class="o">)</span>
+
+<span class="c1">// Hamming loss</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Hamming loss = ${metrics.hammingLoss}&quot;</span><span class="o">)</span>
+
+<span class="c1">// Subset accuracy</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Subset accuracy = ${metrics.subsetAccuracy}&quot;</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.rdd.RDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.evaluation.MultilabelMetrics</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.SparkConf</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span>
+
+<span class="kd">public</span> <span class="kd">class</span> <span class="nc">MultilabelClassification</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="kd">static</span> <span class="kt">void</span> <span class="nf">main</span><span class="o">(</span><span class="n">String</span><span class="o">[]</span> <span class="n">args</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">SparkConf</span> <span class="n">conf</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">SparkConf</span><span class="o">().</span><span class="na">setAppName</span><span class="o">(</span><span class="s">&quot;Multilabel Classification Metrics&quot;</span><span class="o">);</span>
+ <span class="n">JavaSparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">JavaSparkContext</span><span class="o">(</span><span class="n">conf</span><span class="o">);</span>
+
+ <span class="n">List</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">}),</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">}),</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]{},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">0.0</span><span class="o">}),</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">2.0</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">2.0</span><span class="o">}),</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">}),</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">}),</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">1.0</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">})</span>
+ <span class="o">);</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="kt">double</span><span class="o">[],</span> <span class="kt">double</span><span class="o">[]&gt;&gt;</span> <span class="n">scoreAndLabels</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">data</span><span class="o">);</span>
+
+ <span class="c1">// Instantiate metrics object</span>
+ <span class="n">MultilabelMetrics</span> <span class="n">metrics</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">MultilabelMetrics</span><span class="o">(</span><span class="n">scoreAndLabels</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span>
+
+ <span class="c1">// Summary stats</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Recall = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">recall</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Precision = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">precision</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;F1 measure = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">f1Measure</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Accuracy = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">accuracy</span><span class="o">());</span>
+
+ <span class="c1">// Stats by labels</span>
+ <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">().</span><span class="na">length</span><span class="o">;</span> <span class="n">i</span><span class="o">++)</span> <span class="o">{</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Class %1.1f precision = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">],</span> <span class="n">metrics</span><span class="o">.</span><span class="na">precision</span><span class="o">(</span><span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">]));</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Class %1.1f recall = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">],</span> <span class="n">metrics</span><span class="o">.</span><span class="na">recall</span><span class="o">(</span><span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">]));</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Class %1.1f F1 score = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">],</span> <span class="n">metrics</span><span class="o">.</span><span class="na">f1Measure</span><span class="o">(</span><span class="n">metrics</span><span class="o">.</span><span class="na">labels</span><span class="o">()[</span><span class="n">i</span><span class="o">]));</span>
+ <span class="o">}</span>
+
+ <span class="c1">// Micro stats</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Micro recall = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">microRecall</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Micro precision = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">microPrecision</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Micro F1 measure = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">microF1Measure</span><span class="o">());</span>
+
+ <span class="c1">// Hamming loss</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Hamming loss = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">hammingLoss</span><span class="o">());</span>
+
+ <span class="c1">// Subset accuracy</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Subset accuracy = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">subsetAccuracy</span><span class="o">());</span>
+
+ <span class="o">}</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.evaluation</span> <span class="kn">import</span> <span class="n">MultilabelMetrics</span>
+
+<span class="n">scoreAndLabels</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span>
+ <span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">]),</span>
+ <span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),</span>
+ <span class="p">([],</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">]),</span>
+ <span class="p">([</span><span class="mf">2.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">]),</span>
+ <span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span>
+ <span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">]),</span>
+ <span class="p">([</span><span class="mf">1.0</span><span class="p">],</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">])])</span>
+
+<span class="c"># Instantiate metrics object</span>
+<span class="n">metrics</span> <span class="o">=</span> <span class="n">MultilabelMetrics</span><span class="p">(</span><span class="n">scoreAndLabels</span><span class="p">)</span>
+
+<span class="c"># Summary stats</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Recall = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">recall</span><span class="p">())</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Precision = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">precision</span><span class="p">())</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;F1 measure = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">f1Measure</span><span class="p">())</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Accuracy = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">accuracy</span><span class="p">)</span>
+
+<span class="c"># Individual label stats</span>
+<span class="n">labels</span> <span class="o">=</span> <span class="n">scoreAndLabels</span><span class="o">.</span><span class="n">flatMap</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span><span class="o">.</span><span class="n">collect</span><span class="p">()</span>
+<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span><span class="p">:</span>
+ <span class="k">print</span><span class="p">(</span><span class="s">&quot;Class </span><span class="si">%s</span><span class="s"> precision = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">metrics</span><span class="o">.</span><span class="n">precision</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span>
+ <span class="k">print</span><span class="p">(</span><span class="s">&quot;Class </span><span class="si">%s</span><span class="s"> recall = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">metrics</span><span class="o">.</span><span class="n">recall</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span>
+ <span class="k">print</span><span class="p">(</span><span class="s">&quot;Class </span><span class="si">%s</span><span class="s"> F1 Measure = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">metrics</span><span class="o">.</span><span class="n">f1Measure</span><span class="p">(</span><span class="n">label</span><span class="p">)))</span>
+
+<span class="c"># Micro stats</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Micro precision = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">microPrecision</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Micro recall = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">microRecall</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Micro F1 measure = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">microF1Measure</span><span class="p">)</span>
+
+<span class="c"># Hamming loss</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Hamming loss = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">hammingLoss</span><span class="p">)</span>
+
+<span class="c"># Subset accuracy</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Subset accuracy = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">subsetAccuracy</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h3 id="ranking-systems">Ranking systems</h3>
+
+<p>The role of a ranking algorithm (often thought of as a <a href="https://en.wikipedia.org/wiki/Recommender_system">recommender system</a>)
+is to return to the user a set of relevant items or documents based on some training data. The definition of relevance
+may vary and is usually application specific. Ranking system metrics aim to quantify the effectiveness of these
+rankings or recommendations in various contexts. Some metrics compare a set of recommended documents to a ground truth
+set of relevant documents, while other metrics may incorporate numerical ratings explicitly.</p>
+
+<p><strong>Available metrics</strong></p>
+
+<p>A ranking system usually deals with a set of $M$ users</p>
+
+<script type="math/tex; mode=display">U = \left\{u_0, u_1, ..., u_{M-1}\right\}</script>
+
+<p>Each user ($u_i$) having a set of $N$ ground truth relevant documents</p>
+
+<script type="math/tex; mode=display">D_i = \left\{d_0, d_1, ..., d_{N-1}\right\}</script>
+
+<p>And a list of $Q$ recommended documents, in order of decreasing relevance</p>
+
+<script type="math/tex; mode=display">R_i = \left[r_0, r_1, ..., r_{Q-1}\right]</script>
+
+<p>The goal of the ranking system is to produce the most relevant set of documents for each user. The relevance of the
+sets and the effectiveness of the algorithms can be measured using the metrics listed below.</p>
+
+<p>It is necessary to define a function which, provided a recommended document and a set of ground truth relevant
+documents, returns a relevance score for the recommended document.</p>
+
+<script type="math/tex; mode=display">% <![CDATA[
+rel_D(r) = \begin{cases}1 & \text{if $r \in D$}, \\ 0 & \text{otherwise}.\end{cases} %]]></script>
+
+<table class="table">
+ <thead>
+ <tr><th>Metric</th><th>Definition</th><th>Notes</th></tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>
+ Precision at k
+ </td>
+ <td>
+ $p(k)=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{k} \sum_{j=0}^{\text{min}(Q, k) - 1} rel_{D_i}(R_i(j))}$
+ </td>
+ <td>
+ <a href="https://en.wikipedia.org/wiki/Information_retrieval#Precision_at_K">Precision at k</a> is a measure of
+ how many of the first k recommended documents are in the set of true relevant documents averaged across all
+ users. In this metric, the order of the recommendations is not taken into account.
+ </td>
+ </tr>
+ <tr>
+ <td>Mean Average Precision</td>
+ <td>
+ $MAP=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{\left|D_i\right|} \sum_{j=0}^{Q-1} \frac{rel_{D_i}(R_i(j))}{j + 1}}$
+ </td>
+ <td>
+ <a href="https://en.wikipedia.org/wiki/Information_retrieval#Mean_average_precision">MAP</a> is a measure of how
+ many of the recommended documents are in the set of true relevant documents, where the
+ order of the recommendations is taken into account (i.e. the penalty for misranking highly relevant documents is higher).
+ </td>
+ </tr>
+ <tr>
+ <td>Normalized Discounted Cumulative Gain</td>
+ <td>
+ $NDCG(k)=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{IDCG(D_i, k)}\sum_{j=0}^{n-1}
+ \frac{rel_{D_i}(R_i(j))}{\text{ln}(j+2)}} \\
+ \text{Where} \\
+ \hspace{5 mm} n = \text{min}\left(\text{max}\left(|R_i|,|D_i|\right),k\right) \\
+ \hspace{5 mm} IDCG(D, k) = \sum_{j=0}^{\text{min}(\left|D\right|, k) - 1} \frac{1}{\text{ln}(j+2)}$
+ </td>
+ <td>
+ <a href="https://en.wikipedia.org/wiki/Information_retrieval#Discounted_cumulative_gain">NDCG at k</a> is a
+ measure of how many of the first k recommended documents are in the set of true relevant documents averaged
+ across all users. In contrast to precision at k, this metric takes into account the order of the recommendations
+ (documents are assumed to be in order of decreasing relevance).
+ </td>
+ </tr>
+ </tbody>
+</table>
+
+<p><strong>Examples</strong></p>
+
+<p>The following code snippets illustrate how to load a sample dataset, train an alternating least squares recommendation
+model on the data, and evaluate the performance of the recommender by several ranking metrics. A brief summary of the
+methodology is provided below.</p>
+
+<p>MovieLens ratings are on a scale of 1-5:</p>
+
+<ul>
+ <li>5: Must see</li>
+ <li>4: Will enjoy</li>
+ <li>3: It&#8217;s okay</li>
+ <li>2: Fairly bad</li>
+ <li>1: Awful</li>
+</ul>
+
+<p>So we should not recommend a movie if the predicted rating is less than 3.
+To map ratings to confidence scores, we use:</p>
+
+<ul>
+ <li>5 -&gt; 2.5</li>
+ <li>4 -&gt; 1.5</li>
+ <li>3 -&gt; 0.5</li>
+ <li>2 -&gt; -0.5</li>
+ <li>1 -&gt; -1.5.</li>
+</ul>
+
+<p>This mapping means unobserved entries are generally between It&#8217;s okay and Fairly bad. The semantics of 0 in this
+expanded world of non-positive weights are &#8220;the same as never having interacted at all.&#8221;</p>
+
+<div class="codetabs">
+
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.evaluation.</span><span class="o">{</span><span class="nc">RegressionMetrics</span><span class="o">,</span> <span class="nc">RankingMetrics</span><span class="o">}</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.recommendation.</span><span class="o">{</span><span class="nc">ALS</span><span class="o">,</span> <span class="nc">Rating</span><span class="o">}</span>
+
+<span class="c1">// Read in the ratings data</span>
+<span class="k">val</span> <span class="n">ratings</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="o">(</span><span class="s">&quot;data/mllib/sample_movielens_data.txt&quot;</span><span class="o">).</span><span class="n">map</span> <span class="o">{</span> <span class="n">line</span> <span class="k">=&gt;</span>
+ <span class="k">val</span> <span class="n">fields</span> <span class="k">=</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot;::&quot;</span><span class="o">)</span>
+ <span class="nc">Rating</span><span class="o">(</span><span class="n">fields</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="n">toInt</span><span class="o">,</span> <span class="n">fields</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="n">toInt</span><span class="o">,</span> <span class="n">fields</span><span class="o">(</span><span class="mi">2</span><span class="o">).</span><span class="n">toDouble</span> <span class="o">-</span> <span class="mf">2.5</span><span class="o">)</span>
+<span class="o">}.</span><span class="n">cache</span><span class="o">()</span>
+
+<span class="c1">// Map ratings to 1 or 0, 1 indicating a movie that should be recommended</span>
+<span class="k">val</span> <span class="n">binarizedRatings</span> <span class="k">=</span> <span class="n">ratings</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">r</span> <span class="k">=&gt;</span> <span class="nc">Rating</span><span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="n">user</span><span class="o">,</span> <span class="n">r</span><span class="o">.</span><span class="n">product</span><span class="o">,</span> <span class="k">if</span> <span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="n">rating</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="o">)</span> <span class="mf">1.0</span> <span class="k">else</span> <span class="mf">0.0</span><span class="o">)).</span><span class="n">cache</span><span class="o">()</span>
+
+<span class="c1">// Summarize ratings</span>
+<span class="k">val</span> <span class="n">numRatings</span> <span class="k">=</span> <span class="n">ratings</span><span class="o">.</span><span class="n">count</span><span class="o">()</span>
+<span class="k">val</span> <span class="n">numUsers</span> <span class="k">=</span> <span class="n">ratings</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">user</span><span class="o">).</span><span class="n">distinct</span><span class="o">().</span><span class="n">count</span><span class="o">()</span>
+<span class="k">val</span> <span class="n">numMovies</span> <span class="k">=</span> <span class="n">ratings</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">product</span><span class="o">).</span><span class="n">distinct</span><span class="o">().</span><span class="n">count</span><span class="o">()</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Got $numRatings ratings from $numUsers users on $numMovies movies.&quot;</span><span class="o">)</span>
+
+<span class="c1">// Build the model</span>
+<span class="k">val</span> <span class="n">numIterations</span> <span class="k">=</span> <span class="mi">10</span>
+<span class="k">val</span> <span class="n">rank</span> <span class="k">=</span> <span class="mi">10</span>
+<span class="k">val</span> <span class="n">lambda</span> <span class="k">=</span> <span class="mf">0.01</span>
+<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="nc">ALS</span><span class="o">.</span><span class="n">train</span><span class="o">(</span><span class="n">ratings</span><span class="o">,</span> <span class="n">rank</span><span class="o">,</span> <span class="n">numIterations</span><span class="o">,</span> <span class="n">lambda</span><span class="o">)</span>
+
+<span class="c1">// Define a function to scale ratings from 0 to 1</span>
+<span class="k">def</span> <span class="n">scaledRating</span><span class="o">(</span><span class="n">r</span><span class="k">:</span> <span class="kt">Rating</span><span class="o">)</span><span class="k">:</span> <span class="kt">Rating</span> <span class="o">=</span> <span class="o">{</span>
+ <span class="k">val</span> <span class="n">scaledRating</span> <span class="k">=</span> <span class="n">math</span><span class="o">.</span><span class="n">max</span><span class="o">(</span><span class="n">math</span><span class="o">.</span><span class="n">min</span><span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="n">rating</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span> <span class="mf">0.0</span><span class="o">)</span>
+ <span class="nc">Rating</span><span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="n">user</span><span class="o">,</span> <span class="n">r</span><span class="o">.</span><span class="n">product</span><span class="o">,</span> <span class="n">scaledRating</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Get sorted top ten predictions for each user and then scale from [0, 1]</span>
+<span class="k">val</span> <span class="n">userRecommended</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">recommendProductsForUsers</span><span class="o">(</span><span class="mi">10</span><span class="o">).</span><span class="n">map</span><span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">user</span><span class="o">,</span> <span class="n">recs</span><span class="o">)</span> <span class="k">=&gt;</span>
+ <span class="o">(</span><span class="n">user</span><span class="o">,</span> <span class="n">recs</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">scaledRating</span><span class="o">))</span>
+<span class="o">}</span>
+
+<span class="c1">// Assume that any movie a user rated 3 or higher (which maps to a 1) is a relevant document</span>
+<span class="c1">// Compare with top ten most relevant documents</span>
+<span class="k">val</span> <span class="n">userMovies</span> <span class="k">=</span> <span class="n">binarizedRatings</span><span class="o">.</span><span class="n">groupBy</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">user</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">relevantDocuments</span> <span class="k">=</span> <span class="n">userMovies</span><span class="o">.</span><span class="n">join</span><span class="o">(</span><span class="n">userRecommended</span><span class="o">).</span><span class="n">map</span><span class="o">{</span> <span class="k">case</span> <span class="o">(</span><span class="n">user</span><span class="o">,</span> <span class="o">(</span><span class="n">actual</span><span class="o">,</span> <span class="n">predictions</span><span class="o">))</span> <span class="k">=&gt;</span>
+ <span class="o">(</span><span class="n">predictions</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">product</span><span class="o">),</span> <span class="n">actual</span><span class="o">.</span><span class="n">filter</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">rating</span> <span class="o">&gt;</span> <span class="mf">0.0</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">product</span><span class="o">).</span><span class="n">toArray</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Instantiate metrics object</span>
+<span class="k">val</span> <span class="n">metrics</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">RankingMetrics</span><span class="o">(</span><span class="n">relevantDocuments</span><span class="o">)</span>
+
+<span class="c1">// Precision at K</span>
+<span class="nc">Array</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mi">3</span><span class="o">,</span> <span class="mi">5</span><span class="o">).</span><span class="n">foreach</span><span class="o">{</span> <span class="n">k</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Precision at $k = ${metrics.precisionAt(k)}&quot;</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Mean average precision</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Mean average precision = ${metrics.meanAveragePrecision}&quot;</span><span class="o">)</span>
+
+<span class="c1">// Normalized discounted cumulative gain</span>
+<span class="nc">Array</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mi">3</span><span class="o">,</span> <span class="mi">5</span><span class="o">).</span><span class="n">foreach</span><span class="o">{</span> <span class="n">k</span> <span class="k">=&gt;</span>
+ <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;NDCG at $k = ${metrics.ndcgAt(k)}&quot;</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Get predictions for each data point</span>
+<span class="k">val</span> <span class="n">allPredictions</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">ratings</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">r</span> <span class="k">=&gt;</span> <span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="n">user</span><span class="o">,</span> <span class="n">r</span><span class="o">.</span><span class="n">product</span><span class="o">))).</span><span class="n">map</span><span class="o">(</span><span class="n">r</span> <span class="k">=&gt;</span> <span class="o">((</span><span class="n">r</span><span class="o">.</span><span class="n">user</span><span class="o">,</span> <span class="n">r</span><span class="o">.</span><span class="n">product</span><span class="o">),</span> <span class="n">r</span><span class="o">.</span><span class="n">rating</span><span class="o">))</span>
+<span class="k">val</span> <span class="n">allRatings</span> <span class="k">=</span> <span class="n">ratings</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="n">r</span> <span class="k">=&gt;</span> <span class="o">((</span><span class="n">r</span><span class="o">.</span><span class="n">user</span><span class="o">,</span> <span class="n">r</span><span class="o">.</span><span class="n">product</span><span class="o">),</span> <span class="n">r</span><span class="o">.</span><span class="n">rating</span><span class="o">))</span>
+<span class="k">val</span> <span class="n">predictionsAndLabels</span> <span class="k">=</span> <span class="n">allPredictions</span><span class="o">.</span><span class="n">join</span><span class="o">(</span><span class="n">allRatings</span><span class="o">).</span><span class="n">map</span><span class="o">{</span> <span class="k">case</span> <span class="o">((</span><span class="n">user</span><span class="o">,</span> <span class="n">product</span><span class="o">),</span> <span class="o">(</span><span class="n">predicted</span><span class="o">,</span> <span class="n">actual</span><span class="o">))</span> <span class="k">=&gt;</span>
+ <span class="o">(</span><span class="n">predicted</span><span class="o">,</span> <span class="n">actual</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Get the RMSE using regression metrics</span>
+<span class="k">val</span> <span class="n">regressionMetrics</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">RegressionMetrics</span><span class="o">(</span><span class="n">predictionsAndLabels</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;RMSE = ${regressionMetrics.rootMeanSquaredError}&quot;</span><span class="o">)</span>
+
+<span class="c1">// R-squared</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;R-squared = ${regressionMetrics.r2}&quot;</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.rdd.RDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.recommendation.MatrixFactorizationModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.SparkConf</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">java.util.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.evaluation.RegressionMetrics</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.evaluation.RankingMetrics</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.recommendation.ALS</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.recommendation.Rating</span><span class="o">;</span>
+
+<span class="c1">// Read in the ratings data</span>
+<span class="kd">public</span> <span class="kd">class</span> <span class="nc">Ranking</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="kd">static</span> <span class="kt">void</span> <span class="nf">main</span><span class="o">(</span><span class="n">String</span><span class="o">[]</span> <span class="n">args</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">SparkConf</span> <span class="n">conf</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">SparkConf</span><span class="o">().</span><span class="na">setAppName</span><span class="o">(</span><span class="s">&quot;Ranking Metrics&quot;</span><span class="o">);</span>
+ <span class="n">JavaSparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">JavaSparkContext</span><span class="o">(</span><span class="n">conf</span><span class="o">);</span>
+ <span class="n">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">&quot;data/mllib/sample_movielens_data.txt&quot;</span><span class="o">;</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="n">path</span><span class="o">);</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">&gt;</span> <span class="n">ratings</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Rating</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Rating</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">line</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">String</span><span class="o">[]</span> <span class="n">parts</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot;::&quot;</span><span class="o">);</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="nf">Rating</span><span class="o">(</span><span class="n">Integer</span><span class="o">.</span><span class="na">parseInt</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">0</span><span class="o">]),</span> <span class="n">Integer</span><span class="o">.</span><span class="na">parseInt</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">1</span><span class="o">]),</span> <span class="n">Double</span><span class="o">.</span><span class="na">parseDouble</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">2</span><span class="o">])</span> <span class="o">-</span> <span class="mf">2.5</span><span class="o">);</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+ <span class="n">ratings</span><span class="o">.</span><span class="na">cache</span><span class="o">();</span>
+
+ <span class="c1">// Train an ALS model</span>
+ <span class="kd">final</span> <span class="n">MatrixFactorizationModel</span> <span class="n">model</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="na">train</span><span class="o">(</span><span class="n">JavaRDD</span><span class="o">.</span><span class="na">toRDD</span><span class="o">(</span><span class="n">ratings</span><span class="o">),</span> <span class="mi">10</span><span class="o">,</span> <span class="mi">10</span><span class="o">,</span> <span class="mf">0.01</span><span class="o">);</span>
+
+ <span class="c1">// Get top 10 recommendations for every user and scale ratings from 0 to 1</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Rating</span><span class="o">[]&gt;&gt;</span> <span class="n">userRecs</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="na">recommendProductsForUsers</span><span class="o">(</span><span class="mi">10</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Rating</span><span class="o">[]&gt;&gt;</span> <span class="n">userRecsScaled</span> <span class="o">=</span> <span class="n">userRecs</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Rating</span><span class="o">[]&gt;,</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Rating</span><span class="o">[]&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Rating</span><span class="o">[]&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Rating</span><span class="o">[]&gt;</span> <span class="n">t</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">Rating</span><span class="o">[]</span> <span class="n">scaledRatings</span> <span class="o">=</span> <span class="k">new</span> <span class="n">Rating</span><span class="o">[</span><span class="n">t</span><span class="o">.</span><span class="na">_2</span><span class="o">().</span><span class="na">length</span><span class="o">];</span>
+ <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">scaledRatings</span><span class="o">.</span><span class="na">length</span><span class="o">;</span> <span class="n">i</span><span class="o">++)</span> <span class="o">{</span>
+ <span class="kt">double</span> <span class="n">newRating</span> <span class="o">=</span> <span class="n">Math</span><span class="o">.</span><span class="na">max</span><span class="o">(</span><span class="n">Math</span><span class="o">.</span><span class="na">min</span><span class="o">(</span><span class="n">t</span><span class="o">.</span><span class="na">_2</span><span class="o">()[</span><span class="n">i</span><span class="o">].</span><span class="na">rating</span><span class="o">(),</span> <span class="mf">1.0</span><span class="o">),</span> <span class="mf">0.0</span><span class="o">);</span>
+ <span class="n">scaledRatings</span><span class="o">[</span><span class="n">i</span><span class="o">]</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Rating</span><span class="o">(</span><span class="n">t</span><span class="o">.</span><span class="na">_2</span><span class="o">()[</span><span class="n">i</span><span class="o">].</span><span class="na">user</span><span class="o">(),</span> <span class="n">t</span><span class="o">.</span><span class="na">_2</span><span class="o">()[</span><span class="n">i</span><span class="o">].</span><span class="na">product</span><span class="o">(),</span> <span class="n">newRating</span><span class="o">);</span>
+ <span class="o">}</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Rating</span><span class="o">[]&gt;(</span><span class="n">t</span><span class="o">.</span><span class="na">_1</span><span class="o">(),</span> <span class="n">scaledRatings</span><span class="o">);</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+ <span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Rating</span><span class="o">[]&gt;</span> <span class="n">userRecommended</span> <span class="o">=</span> <span class="n">JavaPairRDD</span><span class="o">.</span><span class="na">fromJavaRDD</span><span class="o">(</span><span class="n">userRecsScaled</span><span class="o">);</span>
+
+ <span class="c1">// Map ratings to 1 or 0, 1 indicating a movie that should be recommended</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">&gt;</span> <span class="n">binarizedRatings</span> <span class="o">=</span> <span class="n">ratings</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">,</span> <span class="n">Rating</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Rating</span> <span class="nf">call</span><span class="o">(</span><span class="n">Rating</span> <span class="n">r</span><span class="o">)</span> <span class="o">{</span>
+ <span class="kt">double</span> <span class="n">binaryRating</span><span class="o">;</span>
+ <span class="k">if</span> <span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="na">rating</span><span class="o">()</span> <span class="o">&gt;</span> <span class="mf">0.0</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">binaryRating</span> <span class="o">=</span> <span class="mf">1.0</span><span class="o">;</span>
+ <span class="o">}</span>
+ <span class="k">else</span> <span class="o">{</span>
+ <span class="n">binaryRating</span> <span class="o">=</span> <span class="mf">0.0</span><span class="o">;</span>
+ <span class="o">}</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="nf">Rating</span><span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="na">user</span><span class="o">(),</span> <span class="n">r</span><span class="o">.</span><span class="na">product</span><span class="o">(),</span> <span class="n">binaryRating</span><span class="o">);</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+
+ <span class="c1">// Group ratings by common user</span>
+ <span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Iterable</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">&gt;&gt;</span> <span class="n">userMovies</span> <span class="o">=</span> <span class="n">binarizedRatings</span><span class="o">.</span><span class="na">groupBy</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Object</span> <span class="nf">call</span><span class="o">(</span><span class="n">Rating</span> <span class="n">r</span><span class="o">)</span> <span class="o">{</span>
+ <span class="k">return</span> <span class="n">r</span><span class="o">.</span><span class="na">user</span><span class="o">();</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+
+ <span class="c1">// Get true relevant documents from all user ratings</span>
+ <span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;&gt;</span> <span class="n">userMoviesList</span> <span class="o">=</span> <span class="n">userMovies</span><span class="o">.</span><span class="na">mapValues</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Iterable</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">&gt;,</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">Iterable</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">&gt;</span> <span class="n">docs</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;</span> <span class="n">products</span> <span class="o">=</span> <span class="k">new</span> <span class="n">ArrayList</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;();</span>
+ <span class="k">for</span> <span class="o">(</span><span class="n">Rating</span> <span class="n">r</span> <span class="o">:</span> <span class="n">docs</span><span class="o">)</span> <span class="o">{</span>
+ <span class="k">if</span> <span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="na">rating</span><span class="o">()</span> <span class="o">&gt;</span> <span class="mf">0.0</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">products</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="na">product</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="k">return</span> <span class="n">products</span><span class="o">;</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+
+ <span class="c1">// Extract the product id from each recommendation</span>
+ <span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;&gt;</span> <span class="n">userRecommendedList</span> <span class="o">=</span> <span class="n">userRecommended</span><span class="o">.</span><span class="na">mapValues</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">[],</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">Rating</span><span class="o">[]</span> <span class="n">docs</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;</span> <span class="n">products</span> <span class="o">=</span> <span class="k">new</span> <span class="n">ArrayList</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;();</span>
+ <span class="k">for</span> <span class="o">(</span><span class="n">Rating</span> <span class="n">r</span> <span class="o">:</span> <span class="n">docs</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">products</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="na">product</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="k">return</span> <span class="n">products</span><span class="o">;</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;,</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">&gt;&gt;&gt;</span> <span class="n">relevantDocs</span> <span class="o">=</span> <span class="n">userMoviesList</span><span class="o">.</span><span class="na">join</span><span class="o">(</span><span class="n">userRecommendedList</span><span class="o">).</span><span class="na">values</span><span class="o">();</span>
+
+ <span class="c1">// Instantiate the metrics object</span>
+ <span class="n">RankingMetrics</span> <span class="n">metrics</span> <span class="o">=</span> <span class="n">RankingMetrics</span><span class="o">.</span><span class="na">of</span><span class="o">(</span><span class="n">relevantDocs</span><span class="o">);</span>
+
+ <span class="c1">// Precision and NDCG at k</span>
+ <span class="n">Integer</span><span class="o">[]</span> <span class="n">kVector</span> <span class="o">=</span> <span class="o">{</span><span class="mi">1</span><span class="o">,</span> <span class="mi">3</span><span class="o">,</span> <span class="mi">5</span><span class="o">};</span>
+ <span class="k">for</span> <span class="o">(</span><span class="n">Integer</span> <span class="n">k</span> <span class="o">:</span> <span class="n">kVector</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Precision at %d = %f\n&quot;</span><span class="o">,</span> <span class="n">k</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">precisionAt</span><span class="o">(</span><span class="n">k</span><span class="o">));</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;NDCG at %d = %f\n&quot;</span><span class="o">,</span> <span class="n">k</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">ndcgAt</span><span class="o">(</span><span class="n">k</span><span class="o">));</span>
+ <span class="o">}</span>
+
+ <span class="c1">// Mean average precision</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Mean average precision = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">meanAveragePrecision</span><span class="o">());</span>
+
+ <span class="c1">// Evaluate the model using numerical ratings and regression metrics</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">userProducts</span> <span class="o">=</span> <span class="n">ratings</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">,</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">Rating</span> <span class="n">r</span><span class="o">)</span> <span class="o">{</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;(</span><span class="n">r</span><span class="o">.</span><span class="na">user</span><span class="o">(),</span> <span class="n">r</span><span class="o">.</span><span class="na">product</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+ <span class="n">JavaPairRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;,</span> <span class="n">Object</span><span class="o">&gt;</span> <span class="n">predictions</span> <span class="o">=</span> <span class="n">JavaPairRDD</span><span class="o">.</span><span class="na">fromJavaRDD</span><span class="o">(</span>
+ <span class="n">model</span><span class="o">.</span><span class="na">predict</span><span class="o">(</span><span class="n">JavaRDD</span><span class="o">.</span><span class="na">toRDD</span><span class="o">(</span><span class="n">userProducts</span><span class="o">)).</span><span class="na">toJavaRDD</span><span class="o">().</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">,</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;,</span> <span class="n">Object</span><span class="o">&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;,</span> <span class="n">Object</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">Rating</span> <span class="n">r</span><span class="o">){</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;,</span> <span class="n">Object</span><span class="o">&gt;(</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;(</span><span class="n">r</span><span class="o">.</span><span class="na">user</span><span class="o">(),</span> <span class="n">r</span><span class="o">.</span><span class="na">product</span><span class="o">()),</span> <span class="n">r</span><span class="o">.</span><span class="na">rating</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">));</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">ratesAndPreds</span> <span class="o">=</span>
+ <span class="n">JavaPairRDD</span><span class="o">.</span><span class="na">fromJavaRDD</span><span class="o">(</span><span class="n">ratings</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">Rating</span><span class="o">,</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;,</span> <span class="n">Object</span><span class="o">&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;,</span> <span class="n">Object</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">Rating</span> <span class="n">r</span><span class="o">){</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;,</span> <span class="n">Object</span><span class="o">&gt;(</span>
+ <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;(</span><span class="n">r</span><span class="o">.</span><span class="na">user</span><span class="o">(),</span> <span class="n">r</span><span class="o">.</span><span class="na">product</span><span class="o">()),</span> <span class="n">r</span><span class="o">.</span><span class="na">rating</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">)).</span><span class="na">join</span><span class="o">(</span><span class="n">predictions</span><span class="o">).</span><span class="na">values</span><span class="o">();</span>
+
+ <span class="c1">// Create regression metrics object</span>
+ <span class="n">RegressionMetrics</span> <span class="n">regressionMetrics</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">RegressionMetrics</span><span class="o">(</span><span class="n">ratesAndPreds</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span>
+
+ <span class="c1">// Root mean squared error</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;RMSE = %f\n&quot;</span><span class="o">,</span> <span class="n">regressionMetrics</span><span class="o">.</span><span class="na">rootMeanSquaredError</span><span class="o">());</span>
+
+ <span class="c1">// R-squared</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;R-squared = %f\n&quot;</span><span class="o">,</span> <span class="n">regressionMetrics</span><span class="o">.</span><span class="na">r2</span><span class="o">());</span>
+ <span class="o">}</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.recommendation</span> <span class="kn">import</span> <span class="n">ALS</span><span class="p">,</span> <span class="n">Rating</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.evaluation</span> <span class="kn">import</span> <span class="n">RegressionMetrics</span><span class="p">,</span> <span class="n">RankingMetrics</span>
+
+<span class="c"># Read in the ratings data</span>
+<span class="n">lines</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">&quot;data/mllib/sample_movielens_data.txt&quot;</span><span class="p">)</span>
+
+<span class="k">def</span> <span class="nf">parseLine</span><span class="p">(</span><span class="n">line</span><span class="p">):</span>
+ <span class="n">fields</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&quot;::&quot;</span><span class="p">)</span>
+ <span class="k">return</span> <span class="n">Rating</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="nb">int</span><span class="p">(</span><span class="n">fields</span><span class="p">[</span><span class="mi">1</span><span class="p">]),</span> <span class="nb">float</span><span class="p">(</span><span class="n">fields</span><span class="p">[</span><span class="mi">2</span><span class="p">])</span> <span class="o">-</span> <span class="mf">2.5</span><span class="p">)</span>
+<span class="n">ratings</span> <span class="o">=</span> <span class="n">lines</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">r</span><span class="p">:</span> <span class="n">parseLine</span><span class="p">(</span><span class="n">r</span><span class="p">))</span>
+
+<span class="c"># Train a model to predict user-product ratings</span>
+<span class="n">model</span> <span class="o">=</span> <span class="n">ALS</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">ratings</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="mf">0.01</span><span class="p">)</span>
+
+<span class="c"># Get predicted ratings on all existing user-product pairs</span>
+<span class="n">testData</span> <span class="o">=</span> <span class="n">ratings</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="o">.</span><span class="n">user</span><span class="p">,</span> <span class="n">p</span><span class="o">.</span><span class="n">product</span><span class="p">))</span>
+<span class="n">predictions</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predictAll</span><span class="p">(</span><span class="n">testData</span><span class="p">)</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">r</span><span class="p">:</span> <span class="p">((</span><span class="n">r</span><span class="o">.</span><span class="n">user</span><span class="p">,</span> <span class="n">r</span><span class="o">.</span><span class="n">product</span><span class="p">),</span> <span class="n">r</span><span class="o">.</span><span class="n">rating</span><span class="p">))</span>
+
+<span class="n">ratingsTuple</span> <span class="o">=</span> <span class="n">ratings</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">r</span><span class="p">:</span> <span class="p">((</span><span class="n">r</span><span class="o">.</span><span class="n">user</span><span class="p">,</span> <span class="n">r</span><span class="o">.</span><span class="n">product</span><span class="p">),</span> <span class="n">r</span><span class="o">.</span><span class="n">rating</span><span class="p">))</span>
+<span class="n">scoreAndLabels</span> <span class="o">=</span> <span class="n">predictions</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ratingsTuple</span><span class="p">)</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">tup</span><span class="p">:</span> <span class="n">tup</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
+
+<span class="c"># Instantiate regression metrics to compare predicted and actual ratings</span>
+<span class="n">metrics</span> <span class="o">=</span> <span class="n">RegressionMetrics</span><span class="p">(</span><span class="n">scoreAndLabels</span><span class="p">)</span>
+
+<span class="c"># Root mean squared error</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;RMSE = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">rootMeanSquaredError</span><span class="p">)</span>
+
+<span class="c"># R-squared</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;R-squared = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">r2</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="regression-model-evaluation">Regression model evaluation</h2>
+
+<p><a href="https://en.wikipedia.org/wiki/Regression_analysis">Regression analysis</a> is used when predicting a continuous output
+variable from a number of independent variables.</p>
+
+<p><strong>Available metrics</strong></p>
+
+<table class="table">
+ <thead>
+ <tr><th>Metric</th><th>Definition</th></tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Mean Squared Error (MSE)</td>
+ <td>$MSE = \frac{\sum_{i=0}^{N-1} (\mathbf{y}_i - \hat{\mathbf{y}}_i)^2}{N}$</td>
+ </tr>
+ <tr>
+ <td>Root Mean Squared Error (RMSE)</td>
+ <td>$RMSE = \sqrt{\frac{\sum_{i=0}^{N-1} (\mathbf{y}_i - \hat{\mathbf{y}}_i)^2}{N}}$</td>
+ </tr>
+ <tr>
+ <td>Mean Absolute Error (MAE)</td>
+ <td>$MAE=\frac{1}{N}\sum_{i=0}^{N-1} \left|\mathbf{y}_i - \hat{\mathbf{y}}_i\right|$</td>
+ </tr>
+ <tr>
+ <td>Coefficient of Determination $(R^2)$</td>
+ <td>$R^2=1 - \frac{N \cdot MSE}{\text{VAR}(\mathbf{y}) \cdot (N-1)}=1-\frac{\sum_{i=0}^{N-1}
+ (\mathbf{y}_i - \hat{\mathbf{y}}_i)^2}{\sum_{i=0}^{N-1}(\mathbf{y}_i-\bar{\mathbf{y}})^2}$</td>
+ </tr>
+ <tr>
+ <td>Explained Variance</td>
+ <td>$1 - \frac{\text{VAR}(\mathbf{y} - \mathbf{\hat{y}})}{\text{VAR}(\mathbf{y})}$</td>
+ </tr>
+ </tbody>
+</table>
+
+<p><strong>Examples</strong></p>
+
+<div class="codetabs">
+The following code snippets illustrate how to load a sample dataset, train a linear regression algorithm on the data,
+and evaluate the performance of the algorithm by several regression metrics.
+
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LinearRegressionModel</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LinearRegressionWithSGD</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.evaluation.RegressionMetrics</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span>
+
+<span class="c1">// Load the data</span>
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;data/mllib/sample_linear_regression_data.txt&quot;</span><span class="o">).</span><span class="n">cache</span><span class="o">()</span>
+
+<span class="c1">// Build the model</span>
+<span class="k">val</span> <span class="n">numIterations</span> <span class="k">=</span> <span class="mi">100</span>
+<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="nc">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">numIterations</span><span class="o">)</span>
+
+<span class="c1">// Get predictions</span>
+<span class="k">val</span> <span class="n">valuesAndPreds</span> <span class="k">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">{</span> <span class="n">point</span> <span class="k">=&gt;</span>
+ <span class="k">val</span> <span class="n">prediction</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">point</span><span class="o">.</span><span class="n">features</span><span class="o">)</span>
+ <span class="o">(</span><span class="n">prediction</span><span class="o">,</span> <span class="n">point</span><span class="o">.</span><span class="n">label</span><span class="o">)</span>
+<span class="o">}</span>
+
+<span class="c1">// Instantiate metrics object</span>
+<span class="k">val</span> <span class="n">metrics</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">RegressionMetrics</span><span class="o">(</span><span class="n">valuesAndPreds</span><span class="o">)</span>
+
+<span class="c1">// Squared error</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;MSE = ${metrics.meanSquaredError}&quot;</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;RMSE = ${metrics.rootMeanSquaredError}&quot;</span><span class="o">)</span>
+
+<span class="c1">// R-squared</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;R-squared = ${metrics.r2}&quot;</span><span class="o">)</span>
+
+<span class="c1">// Mean absolute error</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;MAE = ${metrics.meanAbsoluteError}&quot;</span><span class="o">)</span>
+
+<span class="c1">// Explained variance</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Explained variance = ${metrics.explainedVariance}&quot;</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.Function</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LinearRegressionModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LinearRegressionWithSGD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.evaluation.RegressionMetrics</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.SparkConf</span><span class="o">;</span>
+
+<span class="kd">public</span> <span class="kd">class</span> <span class="nc">LinearRegression</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="kd">static</span> <span class="kt">void</span> <span class="nf">main</span><span class="o">(</span><span class="n">String</span><span class="o">[]</span> <span class="n">args</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">SparkConf</span> <span class="n">conf</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">SparkConf</span><span class="o">().</span><span class="na">setAppName</span><span class="o">(</span><span class="s">&quot;Linear Regression Example&quot;</span><span class="o">);</span>
+ <span class="n">JavaSparkContext</span> <span class="n">sc</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">JavaSparkContext</span><span class="o">(</span><span class="n">conf</span><span class="o">);</span>
+
+ <span class="c1">// Load and parse the data</span>
+ <span class="n">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">&quot;data/mllib/sample_linear_regression_data.txt&quot;</span><span class="o">;</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="na">textFile</span><span class="o">(</span><span class="n">path</span><span class="o">);</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">parsedData</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">LabeledPoint</span><span class="o">&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">LabeledPoint</span> <span class="nf">call</span><span class="o">(</span><span class="n">String</span> <span class="n">line</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">String</span><span class="o">[]</span> <span class="n">parts</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">);</span>
+ <span class="kt">double</span><span class="o">[]</span> <span class="n">v</span> <span class="o">=</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[</span><span class="n">parts</span><span class="o">.</span><span class="na">length</span> <span class="o">-</span> <span class="mi">1</span><span class="o">];</span>
+ <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">1</span><span class="o">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">parts</span><span class="o">.</span><span class="na">length</span><span class="o">;</span> <span class="n">i</span><span class="o">++)</span>
+ <span class="n">v</span><span class="o">[</span><span class="n">i</span> <span class="o">-</span> <span class="mi">1</span><span class="o">]</span> <span class="o">=</span> <span class="n">Double</span><span class="o">.</span><span class="na">parseDouble</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="n">i</span><span class="o">].</span><span class="na">split</span><span class="o">(</span><span class="s">&quot;:&quot;</span><span class="o">)[</span><span class="mi">1</span><span class="o">]);</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="nf">LabeledPoint</span><span class="o">(</span><span class="n">Double</span><span class="o">.</span><span class="na">parseDouble</span><span class="o">(</span><span class="n">parts</span><span class="o">[</span><span class="mi">0</span><span class="o">]),</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="n">v</span><span class="o">));</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+ <span class="n">parsedData</span><span class="o">.</span><span class="na">cache</span><span class="o">();</span>
+
+ <span class="c1">// Building the model</span>
+ <span class="kt">int</span> <span class="n">numIterations</span> <span class="o">=</span> <span class="mi">100</span><span class="o">;</span>
+ <span class="kd">final</span> <span class="n">LinearRegressionModel</span> <span class="n">model</span> <span class="o">=</span>
+ <span class="n">LinearRegressionWithSGD</span><span class="o">.</span><span class="na">train</span><span class="o">(</span><span class="n">JavaRDD</span><span class="o">.</span><span class="na">toRDD</span><span class="o">(</span><span class="n">parsedData</span><span class="o">),</span> <span class="n">numIterations</span><span class="o">);</span>
+
+ <span class="c1">// Evaluate model on training examples and compute training error</span>
+ <span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;</span> <span class="n">valuesAndPreds</span> <span class="o">=</span> <span class="n">parsedData</span><span class="o">.</span><span class="na">map</span><span class="o">(</span>
+ <span class="k">new</span> <span class="n">Function</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">,</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;&gt;()</span> <span class="o">{</span>
+ <span class="kd">public</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;</span> <span class="nf">call</span><span class="o">(</span><span class="n">LabeledPoint</span> <span class="n">point</span><span class="o">)</span> <span class="o">{</span>
+ <span class="kt">double</span> <span class="n">prediction</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="na">predict</span><span class="o">(</span><span class="n">point</span><span class="o">.</span><span class="na">features</span><span class="o">());</span>
+ <span class="k">return</span> <span class="k">new</span> <span class="n">Tuple2</span><span class="o">&lt;</span><span class="n">Object</span><span class="o">,</span> <span class="n">Object</span><span class="o">&gt;(</span><span class="n">prediction</span><span class="o">,</span> <span class="n">point</span><span class="o">.</span><span class="na">label</span><span class="o">());</span>
+ <span class="o">}</span>
+ <span class="o">}</span>
+ <span class="o">);</span>
+
+ <span class="c1">// Instantiate metrics object</span>
+ <span class="n">RegressionMetrics</span> <span class="n">metrics</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">RegressionMetrics</span><span class="o">(</span><span class="n">valuesAndPreds</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span>
+
+ <span class="c1">// Squared error</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;MSE = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">meanSquaredError</span><span class="o">());</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;RMSE = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">rootMeanSquaredError</span><span class="o">());</span>
+
+ <span class="c1">// R-squared</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;R Squared = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">r2</span><span class="o">());</span>
+
+ <span class="c1">// Mean absolute error</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;MAE = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">meanAbsoluteError</span><span class="o">());</span>
+
+ <span class="c1">// Explained variance</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">format</span><span class="o">(</span><span class="s">&quot;Explained Variance = %f\n&quot;</span><span class="o">,</span> <span class="n">metrics</span><span class="o">.</span><span class="na">explainedVariance</span><span class="o">());</span>
+
+ <span class="c1">// Save and load model</span>
+ <span class="n">model</span><span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="n">sc</span><span class="o">.</span><span class="na">sc</span><span class="o">(),</span> <span class="s">&quot;myModelPath&quot;</span><span class="o">);</span>
+ <span class="n">LinearRegressionModel</span> <span class="n">sameModel</span> <span class="o">=</span> <span class="n">LinearRegressionModel</span><span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="n">sc</span><span class="o">.</span><span class="na">sc</span><span class="o">(),</span> <span class="s">&quot;myModelPath&quot;</span><span class="o">);</span>
+ <span class="o">}</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="kn">import</span> <span class="n">LabeledPoint</span><span class="p">,</span> <span class="n">LinearRegressionWithSGD</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.evaluation</span> <span class="kn">import</span> <span class="n">RegressionMetrics</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">DenseVector</span>
+
+<span class="c"># Load and parse the data</span>
+<span class="k">def</span> <span class="nf">parsePoint</span><span class="p">(</span><span class="n">line</span><span class="p">):</span>
+ <span class="n">values</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="n">split</span><span class="p">()</span>
+ <span class="k">return</span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="n">DenseVector</span><span class="p">([</span><span class="nb">float</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&#39;:&#39;</span><span class="p">)[</span><span class="mi">1</span><span class="p">])</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">values</span><span class="p">[</span><span class="mi">1</span><span class="p">:]]))</span>
+
+<span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">&quot;data/mllib/sample_linear_regression_data.txt&quot;</span><span class="p">)</span>
+<span class="n">parsedData</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">parsePoint</span><span class="p">)</span>
+
+<span class="c"># Build the model</span>
+<span class="n">model</span> <span class="o">=</span> <span class="n">LinearRegressionWithSGD</span><span class="o">.</span><span class="n">train</span><span class="p">(</span><span class="n">parsedData</span><span class="p">)</span>
+
+<span class="c"># Get predictions</span>
+<span class="n">valuesAndPreds</span> <span class="o">=</span> <span class="n">parsedData</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">predict</span><span class="p">(</span><span class="n">p</span><span class="o">.</span><span class="n">features</span><span class="p">)),</span> <span class="n">p</span><span class="o">.</span><span class="n">label</span><span class="p">))</span>
+
+<span class="c"># Instantiate metrics object</span>
+<span class="n">metrics</span> <span class="o">=</span> <span class="n">RegressionMetrics</span><span class="p">(</span><span class="n">valuesAndPreds</span><span class="p">)</span>
+
+<span class="c"># Squared Error</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;MSE = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">meanSquaredError</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;RMSE = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">rootMeanSquaredError</span><span class="p">)</span>
+
+<span class="c"># R-squared</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;R-squared = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">r2</span><span class="p">)</span>
+
+<span class="c"># Mean absolute error</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;MAE = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">meanAbsoluteError</span><span class="p">)</span>
+
+<span class="c"># Explained variance</span>
+<span class="k">print</span><span class="p">(</span><span class="s">&quot;Explained variance = </span><span class="si">%s</span><span class="s">&quot;</span> <span class="o">%</span> <span class="n">metrics</span><span class="o">.</span><span class="n">explainedVariance</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+
+ </div> <!-- /container -->
+
+ <script src="js/vendor/jquery-1.8.0.min.js"></script>
+ <script src="js/vendor/bootstrap.min.js"></script>
+ <script src="js/vendor/anchor.min.js"></script>
+ <script src="js/main.js"></script>
+
+ <!-- MathJax Section -->
+ <script type="text/x-mathjax-config">
+ MathJax.Hub.Config({
+ TeX: { equationNumbers: { autoNumber: "AMS" } }
+ });
+ </script>
+ <script>
+ // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
+ // We could use "//cdn.mathjax...", but that won't support "file://".
+ (function(d, script) {
+ script = d.createElement('script');
+ script.type = 'text/javascript';
+ script.async = true;
+ script.onload = function(){
+ MathJax.Hub.Config({
+ tex2jax: {
+ inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
+ displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
+ processEscapes: true,
+ skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+ }
+ });
+ };
+ script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
+ 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+ d.getElementsByTagName('head')[0].appendChild(script);
+ }(document));
+ </script>
+ </body>
+</html>