summaryrefslogtreecommitdiff
path: root/site/docs/1.5.0/ml-features.html
diff options
context:
space:
mode:
authorReynold Xin <rxin@apache.org>2015-09-17 22:11:21 +0000
committerReynold Xin <rxin@apache.org>2015-09-17 22:11:21 +0000
commit6f57b0c45a7d1b6255067c6e9bc549baa491acac (patch)
treedbf7d7a7700e9e6bad3c8289ab831bc9c2c20d62 /site/docs/1.5.0/ml-features.html
parentee9ffe89d608e7640a2487406b618d27e58026d6 (diff)
downloadspark-website-6f57b0c45a7d1b6255067c6e9bc549baa491acac.tar.gz
spark-website-6f57b0c45a7d1b6255067c6e9bc549baa491acac.tar.bz2
spark-website-6f57b0c45a7d1b6255067c6e9bc549baa491acac.zip
add 1.5.0 back
Diffstat (limited to 'site/docs/1.5.0/ml-features.html')
-rw-r--r--site/docs/1.5.0/ml-features.html2167
1 files changed, 2167 insertions, 0 deletions
diff --git a/site/docs/1.5.0/ml-features.html b/site/docs/1.5.0/ml-features.html
new file mode 100644
index 000000000..e6033bd36
--- /dev/null
+++ b/site/docs/1.5.0/ml-features.html
@@ -0,0 +1,2167 @@
+<!DOCTYPE html>
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
+<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
+<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
+ <head>
+ <meta charset="utf-8">
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+ <title>Feature Extraction, Transformation, and Selection - SparkML - Spark 1.5.0 Documentation</title>
+
+
+
+
+ <link rel="stylesheet" href="css/bootstrap.min.css">
+ <style>
+ body {
+ padding-top: 60px;
+ padding-bottom: 40px;
+ }
+ </style>
+ <meta name="viewport" content="width=device-width">
+ <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
+ <link rel="stylesheet" href="css/main.css">
+
+ <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
+
+ <link rel="stylesheet" href="css/pygments-default.css">
+
+
+ <!-- Google analytics script -->
+ <script type="text/javascript">
+ var _gaq = _gaq || [];
+ _gaq.push(['_setAccount', 'UA-32518208-2']);
+ _gaq.push(['_trackPageview']);
+
+ (function() {
+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+ })();
+ </script>
+
+
+ </head>
+ <body>
+ <!--[if lt IE 7]>
+ <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
+ <![endif]-->
+
+ <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
+
+ <div class="navbar navbar-fixed-top" id="topbar">
+ <div class="navbar-inner">
+ <div class="container">
+ <div class="brand"><a href="index.html">
+ <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">1.5.0</span>
+ </div>
+ <ul class="nav">
+ <!--TODO(andyk): Add class="active" attribute to li some how.-->
+ <li><a href="index.html">Overview</a></li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="quick-start.html">Quick Start</a></li>
+ <li><a href="programming-guide.html">Spark Programming Guide</a></li>
+ <li class="divider"></li>
+ <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
+ <li><a href="sql-programming-guide.html">DataFrames and SQL</a></li>
+ <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
+ <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
+ <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
+ <li><a href="sparkr.html">SparkR (R on Spark)</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li>
+ <li><a href="api/java/index.html">Java</a></li>
+ <li><a href="api/python/index.html">Python</a></li>
+ <li><a href="api/R/index.html">R</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="cluster-overview.html">Overview</a></li>
+ <li><a href="submitting-applications.html">Submitting Applications</a></li>
+ <li class="divider"></li>
+ <li><a href="spark-standalone.html">Spark Standalone</a></li>
+ <li><a href="running-on-mesos.html">Mesos</a></li>
+ <li><a href="running-on-yarn.html">YARN</a></li>
+ <li class="divider"></li>
+ <li><a href="ec2-scripts.html">Amazon EC2</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="configuration.html">Configuration</a></li>
+ <li><a href="monitoring.html">Monitoring</a></li>
+ <li><a href="tuning.html">Tuning Guide</a></li>
+ <li><a href="job-scheduling.html">Job Scheduling</a></li>
+ <li><a href="security.html">Security</a></li>
+ <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
+ <li><a href="hadoop-third-party-distributions.html">3<sup>rd</sup>-Party Hadoop Distros</a></li>
+ <li class="divider"></li>
+ <li><a href="building-spark.html">Building Spark</a></li>
+ <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
+ <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects">Supplemental Projects</a></li>
+ </ul>
+ </li>
+ </ul>
+ <!--<p class="navbar-text pull-right"><span class="version-text">v1.5.0</span></p>-->
+ </div>
+ </div>
+ </div>
+
+ <div class="container" id="content">
+
+ <h1 class="title"><a href="ml-guide.html">ML</a> - Features</h1>
+
+
+ <p>This section covers algorithms for working with features, roughly divided into these groups:</p>
+
+<ul>
+ <li>Extraction: Extracting features from &#8220;raw&#8221; data</li>
+ <li>Transformation: Scaling, converting, or modifying features</li>
+ <li>Selection: Selecting a subset from a larger set of features</li>
+</ul>
+
+<p><strong>Table of Contents</strong></p>
+
+<ul id="markdown-toc">
+ <li><a href="#feature-extractors">Feature Extractors</a> <ul>
+ <li><a href="#tf-idf-hashingtf-and-idf">TF-IDF (HashingTF and IDF)</a></li>
+ <li><a href="#word2vec">Word2Vec</a></li>
+ <li><a href="#countvectorizer">CountVectorizer</a></li>
+ </ul>
+ </li>
+ <li><a href="#feature-transformers">Feature Transformers</a> <ul>
+ <li><a href="#tokenizer">Tokenizer</a></li>
+ <li><a href="#stopwordsremover">StopWordsRemover</a></li>
+ <li><a href="#n-gram">$n$-gram</a></li>
+ <li><a href="#binarizer">Binarizer</a></li>
+ <li><a href="#pca">PCA</a></li>
+ <li><a href="#polynomialexpansion">PolynomialExpansion</a></li>
+ <li><a href="#discrete-cosine-transform-dct">Discrete Cosine Transform (DCT)</a></li>
+ <li><a href="#stringindexer">StringIndexer</a></li>
+ <li><a href="#onehotencoder">OneHotEncoder</a></li>
+ <li><a href="#vectorindexer">VectorIndexer</a></li>
+ <li><a href="#normalizer">Normalizer</a></li>
+ <li><a href="#standardscaler">StandardScaler</a></li>
+ <li><a href="#minmaxscaler">MinMaxScaler</a></li>
+ <li><a href="#bucketizer">Bucketizer</a></li>
+ <li><a href="#elementwiseproduct">ElementwiseProduct</a></li>
+ <li><a href="#vectorassembler">VectorAssembler</a></li>
+ </ul>
+ </li>
+ <li><a href="#feature-selectors">Feature Selectors</a> <ul>
+ <li><a href="#vectorslicer">VectorSlicer</a></li>
+ <li><a href="#rformula">RFormula</a></li>
+ </ul>
+ </li>
+</ul>
+
+<h1 id="feature-extractors">Feature Extractors</h1>
+
+<h2 id="tf-idf-hashingtf-and-idf">TF-IDF (HashingTF and IDF)</h2>
+
+<p><a href="http://en.wikipedia.org/wiki/Tf%E2%80%93idf">Term Frequency-Inverse Document Frequency (TF-IDF)</a> is a common text pre-processing step. In Spark ML, TF-IDF is separated into two parts: TF (+hashing) and IDF.</p>
+
+<p><strong>TF</strong>: <code>HashingTF</code> is a <code>Transformer</code> which takes sets of terms and converts those sets into fixed-length feature vectors. In text processing, a &#8220;set of terms&#8221; might be a bag of words.
+The algorithm combines Term Frequency (TF) counts with the <a href="http://en.wikipedia.org/wiki/Feature_hashing">hashing trick</a> for dimensionality reduction.</p>
+
+<p><strong>IDF</strong>: <code>IDF</code> is an <code>Estimator</code> which fits on a dataset and produces an <code>IDFModel</code>. The <code>IDFModel</code> takes feature vectors (generally created from <code>HashingTF</code>) and scales each column. Intuitively, it down-weights columns which appear frequently in a corpus.</p>
+
+<p>Please refer to the <a href="mllib-feature-extraction.html#tf-idf">MLlib user guide on TF-IDF</a> for more details on Term Frequency and Inverse Document Frequency.
+For API details, refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.HashingTF">HashingTF API docs</a> and the <a href="api/scala/index.html#org.apache.spark.ml.feature.IDF">IDF API docs</a>.</p>
+
+<p>In the following code segment, we start with a set of sentences. We split each sentence into words using <code>Tokenizer</code>. For each sentence (bag of words), we use <code>HashingTF</code> to hash the sentence into a feature vector. We use <code>IDF</code> to rescale the feature vectors; this generally improves performance when using text as features. Our feature vectors could then be passed to a learning algorithm.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">HashingTF</span><span class="o">,</span> <span class="nc">IDF</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span>
+
+<span class="k">val</span> <span class="n">sentenceData</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;Hi I heard about Spark&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;I wish Java could use case classes&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">&quot;Logistic regression models are neat&quot;</span><span class="o">)</span>
+<span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="s">&quot;sentence&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">tokenizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Tokenizer</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;sentence&quot;</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">wordsData</span> <span class="k">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">sentenceData</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">hashingTF</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">HashingTF</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;rawFeatures&quot;</span><span class="o">).</span><span class="n">setNumFeatures</span><span class="o">(</span><span class="mi">20</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">featurizedData</span> <span class="k">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">wordsData</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">idf</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">IDF</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;rawFeatures&quot;</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">idfModel</span> <span class="k">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">rescaledData</span> <span class="k">=</span> <span class="n">idfModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">)</span>
+<span class="n">rescaledData</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="s">&quot;label&quot;</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.HashingTF</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.IDF</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Tokenizer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;Hi I heard about Spark&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;I wish Java could use case classes&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">&quot;Logistic regression models are neat&quot;</span><span class="o">)</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;sentence&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">sentenceData</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">Tokenizer</span> <span class="n">tokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Tokenizer</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;sentence&quot;</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">wordsData</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">sentenceData</span><span class="o">);</span>
+<span class="kt">int</span> <span class="n">numFeatures</span> <span class="o">=</span> <span class="mi">20</span><span class="o">;</span>
+<span class="n">HashingTF</span> <span class="n">hashingTF</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">HashingTF</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;rawFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setNumFeatures</span><span class="o">(</span><span class="n">numFeatures</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">featurizedData</span> <span class="o">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">wordsData</span><span class="o">);</span>
+<span class="n">IDF</span> <span class="n">idf</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">IDF</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;rawFeatures&quot;</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">);</span>
+<span class="n">IDFModel</span> <span class="n">idfModel</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">rescaledData</span> <span class="o">=</span> <span class="n">idfModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">);</span>
+<span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">rescaledData</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="s">&quot;label&quot;</span><span class="o">).</span><span class="na">take</span><span class="o">(</span><span class="mi">3</span><span class="o">))</span> <span class="o">{</span>
+ <span class="n">Vector</span> <span class="n">features</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getAs</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
+ <span class="n">Double</span> <span class="n">label</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getDouble</span><span class="o">(</span><span class="mi">1</span><span class="o">);</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">features</span><span class="o">);</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">HashingTF</span><span class="p">,</span> <span class="n">IDF</span><span class="p">,</span> <span class="n">Tokenizer</span>
+
+<span class="n">sentenceData</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
+ <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">&quot;Hi I heard about Spark&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">&quot;I wish Java could use case classes&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">&quot;Logistic regression models are neat&quot;</span><span class="p">)</span>
+<span class="p">],</span> <span class="p">[</span><span class="s">&quot;label&quot;</span><span class="p">,</span> <span class="s">&quot;sentence&quot;</span><span class="p">])</span>
+<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;sentence&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;words&quot;</span><span class="p">)</span>
+<span class="n">wordsData</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceData</span><span class="p">)</span>
+<span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;words&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;rawFeatures&quot;</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">20</span><span class="p">)</span>
+<span class="n">featurizedData</span> <span class="o">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">wordsData</span><span class="p">)</span>
+<span class="n">idf</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;rawFeatures&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">)</span>
+<span class="n">idfModel</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">featurizedData</span><span class="p">)</span>
+<span class="n">rescaledData</span> <span class="o">=</span> <span class="n">idfModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">featurizedData</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">features_label</span> <span class="ow">in</span> <span class="n">rescaledData</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="s">&quot;label&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span>
+ <span class="k">print</span><span class="p">(</span><span class="n">features_label</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="word2vec">Word2Vec</h2>
+
+<p><code>Word2Vec</code> is an <code>Estimator</code> which takes sequences of words that represent documents and trains a <code>Word2VecModel</code>. The model is essentially a <code>Map(String, Vector)</code>, which maps each word to a unique fixed-size vector. The <code>Word2VecModel</code> transforms each document into a vector using the average of all words in the document; this vector can then be used for other computations on documents, such as similarity calculation. Please refer to the <a href="mllib-feature-extraction.html#Word2Vec">MLlib user guide on Word2Vec</a> for more details on Word2Vec.</p>
+
+<p>Word2Vec is implemented in <a href="api/scala/index.html#org.apache.spark.ml.feature.Word2Vec">Word2Vec</a>. In the following code segment, we start with a set of documents, each of which is represented as a sequence of words. For each document, we transform it into a feature vector. This feature vector could then be passed to a learning algorithm.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Word2Vec</span>
+
+<span class="c1">// Input data: Each row is a bag of words from a sentence or document.</span>
+<span class="k">val</span> <span class="n">documentDF</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="s">&quot;Hi I heard about Spark&quot;</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">),</span>
+ <span class="s">&quot;I wish Java could use case classes&quot;</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">),</span>
+ <span class="s">&quot;Logistic regression models are neat&quot;</span><span class="o">.</span><span class="n">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">)</span>
+<span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;text&quot;</span><span class="o">)</span>
+
+<span class="c1">// Learn a mapping from words to Vectors.</span>
+<span class="k">val</span> <span class="n">word2Vec</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Word2Vec</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;text&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;result&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setVectorSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setMinCount</span><span class="o">(</span><span class="mi">0</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">model</span> <span class="k">=</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">documentDF</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">result</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">documentDF</span><span class="o">)</span>
+<span class="n">result</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;result&quot;</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaSparkContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.SQLContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span>
+
+<span class="n">JavaSparkContext</span> <span class="n">jsc</span> <span class="o">=</span> <span class="o">...</span>
+<span class="n">SQLContext</span> <span class="n">sqlContext</span> <span class="o">=</span> <span class="o">...</span>
+
+<span class="c1">// Input data: Each row is a bag of words from a sentence or document.</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;Hi I heard about Spark&quot;</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">))),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;I wish Java could use case classes&quot;</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">))),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;Logistic regression models are neat&quot;</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">&quot; &quot;</span><span class="o">)))</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;text&quot;</span><span class="o">,</span> <span class="k">new</span> <span class="nf">ArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">documentDF</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+
+<span class="c1">// Learn a mapping from words to Vectors.</span>
+<span class="n">Word2Vec</span> <span class="n">word2Vec</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Word2Vec</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;text&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;result&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setVectorSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setMinCount</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
+<span class="n">Word2VecModel</span> <span class="n">model</span> <span class="o">=</span> <span class="n">word2Vec</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">documentDF</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">documentDF</span><span class="o">);</span>
+<span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="nl">r:</span> <span class="n">result</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;result&quot;</span><span class="o">).</span><span class="na">take</span><span class="o">(</span><span class="mi">3</span><span class="o">))</span> <span class="o">{</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">r</span><span class="o">);</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Word2Vec</span>
+
+<span class="c"># Input data: Each row is a bag of words from a sentence or document.</span>
+<span class="n">documentDF</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
+ <span class="p">(</span><span class="s">&quot;Hi I heard about Spark&quot;</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&quot; &quot;</span><span class="p">),</span> <span class="p">),</span>
+ <span class="p">(</span><span class="s">&quot;I wish Java could use case classes&quot;</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&quot; &quot;</span><span class="p">),</span> <span class="p">),</span>
+ <span class="p">(</span><span class="s">&quot;Logistic regression models are neat&quot;</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&quot; &quot;</span><span class="p">),</span> <span class="p">)</span>
+<span class="p">],</span> <span class="p">[</span><span class="s">&quot;text&quot;</span><span class="p">])</span>
+<span class="c"># Learn a mapping from words to Vectors.</span>
+<span class="n">word2Vec</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;text&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;result&quot;</span><span class="p">)</span>
+<span class="n">model</span> <span class="o">=</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">documentDF</span><span class="p">)</span>
+<span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">documentDF</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">feature</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;result&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span>
+ <span class="k">print</span><span class="p">(</span><span class="n">feature</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="countvectorizer">CountVectorizer</h2>
+
+<p><code>CountVectorizer</code> and <code>CountVectorizerModel</code> aim to help convert a collection of text documents
+ to vectors of token counts. When an a-priori dictionary is not available, <code>CountVectorizer</code> can
+ be used as an <code>Estimator</code> to extract the vocabulary and generate a <code>CountVectorizerModel</code>. The
+ model produces sparse representations for the documents over the vocabulary, which can then be
+ passed to other algorithms like LDA.</p>
+
+<p>During the fitting process, <code>CountVectorizer</code> will select the top <code>vocabSize</code> words ordered by
+ term frequency across the corpus. An optional parameter &#8220;minDF&#8221; also affects the fitting process
+ by specifying the minimum number (or fraction if &lt; 1.0) of documents a term must appear in to be
+ included in the vocabulary.</p>
+
+<p><strong>Examples</strong></p>
+
+<p>Assume that we have the following DataFrame with columns <code>id</code> and <code>texts</code>:</p>
+
+<pre><code> id | texts
+----|----------
+ 0 | Array("a", "b", "c")
+ 1 | Array("a", "b", "b", "c", "a")
+</code></pre>
+
+<p>each row in <code>texts</code> is a document of type Array[String].
+Invoking fit of <code>CountVectorizer</code> produces a <code>CountVectorizerModel</code> with vocabulary (a, b, c),
+then the output column &#8220;vector&#8221; after transformation contains:</p>
+
+<pre><code> id | texts | vector
+----|---------------------------------|---------------
+ 0 | Array("a", "b", "c") | (3,[0,1,2],[1.0,1.0,1.0])
+ 1 | Array("a", "b", "b", "c", "a") | (3,[0,1,2],[2.0,2.0,1.0])
+</code></pre>
+
+<p>each vector represents the token counts of the document over the vocabulary.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+ <p>More details can be found in the API docs for
+<a href="api/scala/index.html#org.apache.spark.ml.feature.CountVectorizer">CountVectorizer</a> and
+<a href="api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel">CountVectorizerModel</a>.</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.CountVectorizer</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.CountVectorizerModel</span>
+
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">)),</span>
+ <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">))</span>
+<span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="s">&quot;words&quot;</span><span class="o">)</span>
+
+<span class="c1">// fit a CountVectorizerModel from the corpus</span>
+<span class="k">val</span> <span class="n">cvModel</span><span class="k">:</span> <span class="kt">CountVectorizerModel</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">CountVectorizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setVocabSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setMinDF</span><span class="o">(</span><span class="mi">2</span><span class="o">)</span> <span class="c1">// a term must appear in at least 2 documents to be included in the vocabulary</span>
+ <span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+
+<span class="c1">// alternatively, define CountVectorizerModel with a-priori vocabulary</span>
+<span class="k">val</span> <span class="n">cvm</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">CountVectorizerModel</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">))</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+
+<span class="n">cvModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">).</span><span class="n">show</span><span class="o">()</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+ <p>More details can be found in the API docs for
+<a href="api/java/org/apache/spark/ml/feature/CountVectorizer.html">CountVectorizer</a> and
+<a href="api/java/org/apache/spark/ml/feature/CountVectorizerModel.html">CountVectorizerModel</a>.</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.CountVectorizer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.CountVectorizerModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span>
+
+<span class="c1">// Input data: Each row is a bag of words from a sentence or document.</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">))</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span> <span class="o">[]</span> <span class="o">{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;text&quot;</span><span class="o">,</span> <span class="k">new</span> <span class="nf">ArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+
+<span class="c1">// fit a CountVectorizerModel from the corpus</span>
+<span class="n">CountVectorizerModel</span> <span class="n">cvModel</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">CountVectorizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;text&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;feature&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setVocabSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setMinDF</span><span class="o">(</span><span class="mi">2</span><span class="o">)</span> <span class="c1">// a term must appear in at least 2 documents to be included in the vocabulary</span>
+ <span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span>
+
+<span class="c1">// alternatively, define CountVectorizerModel with a-priori vocabulary</span>
+<span class="n">CountVectorizerModel</span> <span class="n">cvm</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">CountVectorizerModel</span><span class="o">(</span><span class="k">new</span> <span class="n">String</span><span class="o">[]{</span><span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">})</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;text&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;feature&quot;</span><span class="o">);</span>
+
+<span class="n">cvModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">show</span><span class="o">();</span></code></pre></div>
+
+ </div>
+</div>
+
+<h1 id="feature-transformers">Feature Transformers</h1>
+
+<h2 id="tokenizer">Tokenizer</h2>
+
+<p><a href="http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization">Tokenization</a> is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple <a href="api/scala/index.html#org.apache.spark.ml.feature.Tokenizer">Tokenizer</a> class provides this functionality. The example below shows how to split sentences into sequences of words.</p>
+
+<p><a href="api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer">RegexTokenizer</a> allows more
+ advanced tokenization based on regular expression (regex) matching.
+ By default, the parameter &#8220;pattern&#8221; (regex, default: \s+) is used as delimiters to split the input text.
+ Alternatively, users can set parameter &#8220;gaps&#8221; to false, indicating that the regex &#8220;pattern&#8221; denotes
+ &#8220;tokens&#8221; rather than splitting gaps, and find all matching occurrences as the tokenization result.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">Tokenizer</span><span class="o">,</span> <span class="nc">RegexTokenizer</span><span class="o">}</span>
+
+<span class="k">val</span> <span class="n">sentenceDataFrame</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;Hi I heard about Spark&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">&quot;I wish Java could use case classes&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">&quot;Logistic,regression,models,are,neat&quot;</span><span class="o">)</span>
+<span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="s">&quot;sentence&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">tokenizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Tokenizer</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;sentence&quot;</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">regexTokenizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">RegexTokenizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;sentence&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setPattern</span><span class="o">(</span><span class="s">&quot;\\W&quot;</span><span class="o">)</span> <span class="c1">// alternatively .setPattern(&quot;\\w+&quot;).setGaps(false)</span>
+
+<span class="k">val</span> <span class="n">tokenized</span> <span class="k">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">sentenceDataFrame</span><span class="o">)</span>
+<span class="n">tokenized</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">,</span> <span class="s">&quot;label&quot;</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">regexTokenized</span> <span class="k">=</span> <span class="n">regexTokenizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">sentenceDataFrame</span><span class="o">)</span>
+<span class="n">regexTokenized</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">,</span> <span class="s">&quot;label&quot;</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.RegexTokenizer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Tokenizer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;Hi I heard about Spark&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">&quot;I wish Java could use case classes&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">&quot;Logistic,regression,models,are,neat&quot;</span><span class="o">)</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;sentence&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">sentenceDataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">Tokenizer</span> <span class="n">tokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Tokenizer</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;sentence&quot;</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">wordsDataFrame</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">sentenceDataFrame</span><span class="o">);</span>
+<span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">wordsDataFrame</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">,</span> <span class="s">&quot;label&quot;</span><span class="o">).</span><span class="na">take</span><span class="o">(</span><span class="mi">3</span><span class="o">))</span> <span class="o">{</span>
+ <span class="n">java</span><span class="o">.</span><span class="na">util</span><span class="o">.</span><span class="na">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">words</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getList</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
+ <span class="k">for</span> <span class="o">(</span><span class="n">String</span> <span class="n">word</span> <span class="o">:</span> <span class="n">words</span><span class="o">)</span> <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="n">word</span> <span class="o">+</span> <span class="s">&quot; &quot;</span><span class="o">);</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">();</span>
+<span class="o">}</span>
+
+<span class="n">RegexTokenizer</span> <span class="n">regexTokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">RegexTokenizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;sentence&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setPattern</span><span class="o">(</span><span class="s">&quot;\\W&quot;</span><span class="o">);</span> <span class="c1">// alternatively .setPattern(&quot;\\w+&quot;).setGaps(false);</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Tokenizer</span><span class="p">,</span> <span class="n">RegexTokenizer</span>
+
+<span class="n">sentenceDataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
+ <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">&quot;Hi I heard about Spark&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">&quot;I wish Java could use case classes&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">&quot;Logistic,regression,models,are,neat&quot;</span><span class="p">)</span>
+<span class="p">],</span> <span class="p">[</span><span class="s">&quot;label&quot;</span><span class="p">,</span> <span class="s">&quot;sentence&quot;</span><span class="p">])</span>
+<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;sentence&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;words&quot;</span><span class="p">)</span>
+<span class="n">wordsDataFrame</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceDataFrame</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">words_label</span> <span class="ow">in</span> <span class="n">wordsDataFrame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;words&quot;</span><span class="p">,</span> <span class="s">&quot;label&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span>
+ <span class="k">print</span><span class="p">(</span><span class="n">words_label</span><span class="p">)</span>
+<span class="n">regexTokenizer</span> <span class="o">=</span> <span class="n">RegexTokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;sentence&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;words&quot;</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s">&quot;</span><span class="se">\\</span><span class="s">W&quot;</span><span class="p">)</span>
+<span class="c"># alternatively, pattern=&quot;\\w+&quot;, gaps=False</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="stopwordsremover">StopWordsRemover</h2>
+<p><a href="https://en.wikipedia.org/wiki/Stop_words">Stop words</a> are words that
+should be excluded from the input, typically because they appear
+frequently and carry little meaning.</p>
+
+<p><code>StopWordsRemover</code> takes as input a sequence of strings (e.g. the output
+of a <a href="ml-features.html#tokenizer">Tokenizer</a>) and drops all the stop
+words from the input sequences. The list of stop words is specified by
+the <code>stopWords</code> parameter. We provide <a href="http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words">a list of stop
+words</a> by
+default, accessible by calling <code>getStopWords</code> on a newly instantiated
+<code>StopWordsRemover</code> instance.</p>
+
+<p><strong>Examples</strong></p>
+
+<p>Assume that we have the following DataFrame with columns <code>id</code> and <code>raw</code>:</p>
+
+<pre><code> id | raw
+----|----------
+ 0 | [I, saw, the, red, baloon]
+ 1 | [Mary, had, a, little, lamb]
+</code></pre>
+
+<p>Applying <code>StopWordsRemover</code> with <code>raw</code> as the input column and <code>filtered</code> as the output
+column, we should get the following:</p>
+
+<pre><code> id | raw | filtered
+----|-----------------------------|--------------------
+ 0 | [I, saw, the, red, baloon] | [saw, red, baloon]
+ 1 | [Mary, had, a, little, lamb]|[Mary, little, lamb]
+</code></pre>
+
+<p>In <code>filtered</code>, the stop words &#8220;I&#8221;, &#8220;the&#8221;, &#8220;had&#8221;, and &#8220;a&#8221; have been
+filtered out.</p>
+
+<div class="codetabs">
+
+<div data-lang="scala">
+
+ <p><a href="api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover"><code>StopWordsRemover</code></a>
+takes an input column name, an output column name, a list of stop words,
+and a boolean indicating if the matches should be case sensitive (false
+by default).</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.StopWordsRemover</span>
+
+<span class="k">val</span> <span class="n">remover</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StopWordsRemover</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;raw&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;filtered&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">dataSet</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="nc">Seq</span><span class="o">(</span><span class="s">&quot;I&quot;</span><span class="o">,</span> <span class="s">&quot;saw&quot;</span><span class="o">,</span> <span class="s">&quot;the&quot;</span><span class="o">,</span> <span class="s">&quot;red&quot;</span><span class="o">,</span> <span class="s">&quot;baloon&quot;</span><span class="o">)),</span>
+ <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="nc">Seq</span><span class="o">(</span><span class="s">&quot;Mary&quot;</span><span class="o">,</span> <span class="s">&quot;had&quot;</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="s">&quot;little&quot;</span><span class="o">,</span> <span class="s">&quot;lamb&quot;</span><span class="o">))</span>
+<span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="s">&quot;raw&quot;</span><span class="o">)</span>
+
+<span class="n">remover</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataSet</span><span class="o">).</span><span class="n">show</span><span class="o">()</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <p><a href="api/java/org/apache/spark/ml/feature/StopWordsRemover.html"><code>StopWordsRemover</code></a>
+takes an input column name, an output column name, a list of stop words,
+and a boolean indicating if the matches should be case sensitive (false
+by default).</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StopWordsRemover</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">StopWordsRemover</span> <span class="n">remover</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StopWordsRemover</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;raw&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;filtered&quot;</span><span class="o">);</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">rdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;I&quot;</span><span class="o">,</span> <span class="s">&quot;saw&quot;</span><span class="o">,</span> <span class="s">&quot;the&quot;</span><span class="o">,</span> <span class="s">&quot;red&quot;</span><span class="o">,</span> <span class="s">&quot;baloon&quot;</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;Mary&quot;</span><span class="o">,</span> <span class="s">&quot;had&quot;</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="s">&quot;little&quot;</span><span class="o">,</span> <span class="s">&quot;lamb&quot;</span><span class="o">))</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]</span> <span class="o">{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;raw&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">dataset</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">rdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+
+<span class="n">remover</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">).</span><span class="na">show</span><span class="o">();</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="n-gram">$n$-gram</h2>
+
+<p>An <a href="https://en.wikipedia.org/wiki/N-gram">n-gram</a> is a sequence of $n$ tokens (typically words) for some integer $n$. The <code>NGram</code> class can be used to transform input features into $n$-grams.</p>
+
+<p><code>NGram</code> takes as input a sequence of strings (e.g. the output of a <a href="ml-features.html#tokenizer">Tokenizer</a>). The parameter <code>n</code> is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. If the input sequence contains fewer than <code>n</code> strings, no output is produced.</p>
+
+<div class="codetabs">
+
+<div data-lang="scala">
+
+ <p><a href="api/scala/index.html#org.apache.spark.ml.feature.NGram"><code>NGram</code></a> takes an input column name, an output column name, and an optional length parameter n (n=2 by default).</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.NGram</span>
+
+<span class="k">val</span> <span class="n">wordDataFrame</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">&quot;Hi&quot;</span><span class="o">,</span> <span class="s">&quot;I&quot;</span><span class="o">,</span> <span class="s">&quot;heard&quot;</span><span class="o">,</span> <span class="s">&quot;about&quot;</span><span class="o">,</span> <span class="s">&quot;Spark&quot;</span><span class="o">)),</span>
+ <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">&quot;I&quot;</span><span class="o">,</span> <span class="s">&quot;wish&quot;</span><span class="o">,</span> <span class="s">&quot;Java&quot;</span><span class="o">,</span> <span class="s">&quot;could&quot;</span><span class="o">,</span> <span class="s">&quot;use&quot;</span><span class="o">,</span> <span class="s">&quot;case&quot;</span><span class="o">,</span> <span class="s">&quot;classes&quot;</span><span class="o">)),</span>
+ <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">&quot;Logistic&quot;</span><span class="o">,</span> <span class="s">&quot;regression&quot;</span><span class="o">,</span> <span class="s">&quot;models&quot;</span><span class="o">,</span> <span class="s">&quot;are&quot;</span><span class="o">,</span> <span class="s">&quot;neat&quot;</span><span class="o">))</span>
+<span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="s">&quot;words&quot;</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">ngram</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">NGram</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;ngrams&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">ngramDataFrame</span> <span class="k">=</span> <span class="n">ngram</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">wordDataFrame</span><span class="o">)</span>
+<span class="n">ngramDataFrame</span><span class="o">.</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">getAs</span><span class="o">[</span><span class="kt">Stream</span><span class="o">[</span><span class="kt">String</span><span class="o">]](</span><span class="s">&quot;ngrams&quot;</span><span class="o">).</span><span class="n">toList</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <p><a href="api/java/org/apache/spark/ml/feature/NGram.html"><code>NGram</code></a> takes an input column name, an output column name, and an optional length parameter n (n=2 by default).</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.NGram</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;Hi&quot;</span><span class="o">,</span> <span class="s">&quot;I&quot;</span><span class="o">,</span> <span class="s">&quot;heard&quot;</span><span class="o">,</span> <span class="s">&quot;about&quot;</span><span class="o">,</span> <span class="s">&quot;Spark&quot;</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;I&quot;</span><span class="o">,</span> <span class="s">&quot;wish&quot;</span><span class="o">,</span> <span class="s">&quot;Java&quot;</span><span class="o">,</span> <span class="s">&quot;could&quot;</span><span class="o">,</span> <span class="s">&quot;use&quot;</span><span class="o">,</span> <span class="s">&quot;case&quot;</span><span class="o">,</span> <span class="s">&quot;classes&quot;</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">2.0</span><span class="o">,</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">&quot;Logistic&quot;</span><span class="o">,</span> <span class="s">&quot;regression&quot;</span><span class="o">,</span> <span class="s">&quot;models&quot;</span><span class="o">,</span> <span class="s">&quot;are&quot;</span><span class="o">,</span> <span class="s">&quot;neat&quot;</span><span class="o">))</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">wordDataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">NGram</span> <span class="n">ngramTransformer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">NGram</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;words&quot;</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;ngrams&quot;</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">ngramDataFrame</span> <span class="o">=</span> <span class="n">ngramTransformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">wordDataFrame</span><span class="o">);</span>
+<span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">ngramDataFrame</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;ngrams&quot;</span><span class="o">,</span> <span class="s">&quot;label&quot;</span><span class="o">).</span><span class="na">take</span><span class="o">(</span><span class="mi">3</span><span class="o">))</span> <span class="o">{</span>
+ <span class="n">java</span><span class="o">.</span><span class="na">util</span><span class="o">.</span><span class="na">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">ngrams</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getList</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
+ <span class="k">for</span> <span class="o">(</span><span class="n">String</span> <span class="n">ngram</span> <span class="o">:</span> <span class="n">ngrams</span><span class="o">)</span> <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="n">ngram</span> <span class="o">+</span> <span class="s">&quot; --- &quot;</span><span class="o">);</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">();</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <p><a href="api/python/pyspark.ml.html#pyspark.ml.feature.NGram"><code>NGram</code></a> takes an input column name, an output column name, and an optional length parameter n (n=2 by default).</p>
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">NGram</span>
+
+<span class="n">wordDataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
+ <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="s">&quot;Hi&quot;</span><span class="p">,</span> <span class="s">&quot;I&quot;</span><span class="p">,</span> <span class="s">&quot;heard&quot;</span><span class="p">,</span> <span class="s">&quot;about&quot;</span><span class="p">,</span> <span class="s">&quot;Spark&quot;</span><span class="p">]),</span>
+ <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="s">&quot;I&quot;</span><span class="p">,</span> <span class="s">&quot;wish&quot;</span><span class="p">,</span> <span class="s">&quot;Java&quot;</span><span class="p">,</span> <span class="s">&quot;could&quot;</span><span class="p">,</span> <span class="s">&quot;use&quot;</span><span class="p">,</span> <span class="s">&quot;case&quot;</span><span class="p">,</span> <span class="s">&quot;classes&quot;</span><span class="p">]),</span>
+ <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="s">&quot;Logistic&quot;</span><span class="p">,</span> <span class="s">&quot;regression&quot;</span><span class="p">,</span> <span class="s">&quot;models&quot;</span><span class="p">,</span> <span class="s">&quot;are&quot;</span><span class="p">,</span> <span class="s">&quot;neat&quot;</span><span class="p">])</span>
+<span class="p">],</span> <span class="p">[</span><span class="s">&quot;label&quot;</span><span class="p">,</span> <span class="s">&quot;words&quot;</span><span class="p">])</span>
+<span class="n">ngram</span> <span class="o">=</span> <span class="n">NGram</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;words&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;ngrams&quot;</span><span class="p">)</span>
+<span class="n">ngramDataFrame</span> <span class="o">=</span> <span class="n">ngram</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">wordDataFrame</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">ngrams_label</span> <span class="ow">in</span> <span class="n">ngramDataFrame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;ngrams&quot;</span><span class="p">,</span> <span class="s">&quot;label&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span>
+ <span class="k">print</span><span class="p">(</span><span class="n">ngrams_label</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="binarizer">Binarizer</h2>
+
+<p>Binarization is the process of thresholding numerical features to binary features. Because some probabilistic estimators assume that the input data is distributed according to a <a href="https://en.wikipedia.org/wiki/Bernoulli_distribution">Bernoulli distribution</a>, a binarizer is useful for pre-processing input data with continuous numerical features.</p>
+
+<p>A simple <a href="api/scala/index.html#org.apache.spark.ml.feature.Binarizer">Binarizer</a> class provides this functionality. Besides the common parameters <code>inputCol</code> and <code>outputCol</code>, <code>Binarizer</code> has a <code>threshold</code> parameter used for binarizing continuous numerical features. Feature values greater than the threshold are binarized to 1.0; feature values equal to or less than the threshold are binarized to 0.0. The example below shows how to binarize numerical features.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Binarizer</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span>
+ <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="mf">0.1</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">0.8</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">0.2</span><span class="o">)</span>
+<span class="o">)</span>
+<span class="k">val</span> <span class="n">dataFrame</span><span class="k">:</span> <span class="kt">DataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="s">&quot;feature&quot;</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">binarizer</span><span class="k">:</span> <span class="kt">Binarizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Binarizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;feature&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;binarized_feature&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setThreshold</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">binarizedDataFrame</span> <span class="k">=</span> <span class="n">binarizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">binarizedFeatures</span> <span class="k">=</span> <span class="n">binarizedDataFrame</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;binarized_feature&quot;</span><span class="o">)</span>
+<span class="n">binarizedFeatures</span><span class="o">.</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Binarizer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="mf">0.1</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">0.8</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">0.2</span><span class="o">)</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;feature&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">continuousDataFrame</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">Binarizer</span> <span class="n">binarizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Binarizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;feature&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;binarized_feature&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setThreshold</span><span class="o">(</span><span class="mf">0.5</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">binarizedDataFrame</span> <span class="o">=</span> <span class="n">binarizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">continuousDataFrame</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">binarizedFeatures</span> <span class="o">=</span> <span class="n">binarizedDataFrame</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;binarized_feature&quot;</span><span class="o">);</span>
+<span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">binarizedFeatures</span><span class="o">.</span><span class="na">collect</span><span class="o">())</span> <span class="o">{</span>
+ <span class="n">Double</span> <span class="n">binarized_value</span> <span class="o">=</span> <span class="n">r</span><span class="o">.</span><span class="na">getDouble</span><span class="o">(</span><span class="mi">0</span><span class="o">);</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">binarized_value</span><span class="o">);</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Binarizer</span>
+
+<span class="n">continuousDataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
+ <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">0.8</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">0.2</span><span class="p">)</span>
+<span class="p">],</span> <span class="p">[</span><span class="s">&quot;label&quot;</span><span class="p">,</span> <span class="s">&quot;feature&quot;</span><span class="p">])</span>
+<span class="n">binarizer</span> <span class="o">=</span> <span class="n">Binarizer</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="mf">0.5</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;feature&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;binarized_feature&quot;</span><span class="p">)</span>
+<span class="n">binarizedDataFrame</span> <span class="o">=</span> <span class="n">binarizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">continuousDataFrame</span><span class="p">)</span>
+<span class="n">binarizedFeatures</span> <span class="o">=</span> <span class="n">binarizedDataFrame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;binarized_feature&quot;</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">binarized_feature</span><span class="p">,</span> <span class="ow">in</span> <span class="n">binarizedFeatures</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span>
+ <span class="k">print</span><span class="p">(</span><span class="n">binarized_feature</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="pca">PCA</h2>
+
+<p><a href="http://en.wikipedia.org/wiki/Principal_component_analysis">PCA</a> is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. A <a href="api/scala/index.html#org.apache.spark.ml.feature.PCA">PCA</a> class trains a model to project vectors to a low-dimensional space using PCA. The example below shows how to project 5-dimensional feature vectors into 3-dimensional principal components.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+ <p>See the <a href="api/scala/index.html#org.apache.spark.ml.feature.PCA">Scala API documentation</a> for API details.</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.PCA</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="nc">Seq</span><span class="o">((</span><span class="mi">1</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span> <span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="mf">7.0</span><span class="o">))),</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">,</span> <span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">),</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">,</span> <span class="mf">7.0</span><span class="o">)</span>
+<span class="o">)</span>
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">pca</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">PCA</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;pcaFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setK</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">pcaDF</span> <span class="k">=</span> <span class="n">pca</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">result</span> <span class="k">=</span> <span class="n">pcaDF</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;pcaFeatures&quot;</span><span class="o">)</span>
+<span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="o">()</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+ <p>See the <a href="api/java/org/apache/spark/ml/feature/PCA.html">Java API documentation</a> for API details.</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaSparkContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.PCA</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.PCAModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.VectorUDT</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.SQLContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">JavaSparkContext</span> <span class="n">jsc</span> <span class="o">=</span> <span class="o">...</span>
+<span class="n">SQLContext</span> <span class="n">jsql</span> <span class="o">=</span> <span class="o">...</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">sparse</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="k">new</span> <span class="kt">int</span><span class="o">[]{</span><span class="mi">1</span><span class="o">,</span> <span class="mi">3</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">7.0</span><span class="o">})),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">,</span> <span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">,</span> <span class="mf">7.0</span><span class="o">))</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]</span> <span class="o">{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">PCAModel</span> <span class="n">pca</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">PCA</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;pcaFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setK</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">result</span> <span class="o">=</span> <span class="n">pca</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;pcaFeatures&quot;</span><span class="o">);</span>
+<span class="n">result</span><span class="o">.</span><span class="na">show</span><span class="o">();</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+ <p>See the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.PCA">Python API documentation</a> for API details.</p>
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">PCA</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span>
+
+<span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="mf">7.0</span><span class="p">)]),),</span>
+ <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">,</span> <span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">]),),</span>
+ <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">4.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">6.0</span><span class="p">,</span> <span class="mf">7.0</span><span class="p">]),)]</span>
+<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,[</span><span class="s">&quot;features&quot;</span><span class="p">])</span>
+<span class="n">pca</span> <span class="o">=</span> <span class="n">PCA</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;pcaFeatures&quot;</span><span class="p">)</span>
+<span class="n">model</span> <span class="o">=</span> <span class="n">pca</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
+<span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;pcaFeatures&quot;</span><span class="p">)</span>
+<span class="n">result</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="polynomialexpansion">PolynomialExpansion</h2>
+
+<p><a href="http://en.wikipedia.org/wiki/Polynomial_expansion">Polynomial expansion</a> is the process of expanding your features into a polynomial space, which is formulated by an n-degree combination of original dimensions. A <a href="api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion">PolynomialExpansion</a> class provides this functionality. The example below shows how to expand your features into a 3-degree polynomial space.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.PolynomialExpansion</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">),</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="o">-</span><span class="mf">1.1</span><span class="o">)</span>
+<span class="o">)</span>
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">polynomialExpansion</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">PolynomialExpansion</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;polyFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setDegree</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">polyDF</span> <span class="k">=</span> <span class="n">polynomialExpansion</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+<span class="n">polyDF</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;polyFeatures&quot;</span><span class="o">).</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
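+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaSparkContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.PolynomialExpansion</span><span class="o">;</span>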
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.VectorUDT</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.SQLContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">JavaSparkContext</span> <span class="n">jsc</span> <span class="o">=</span> <span class="o">...</span>
+<span class="n">SQLContext</span> <span class="n">jsql</span> <span class="o">=</span> <span class="o">...</span>
+<span class="n">PolynomialExpansion</span> <span class="n">polyExpansion</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">PolynomialExpansion</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;polyFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setDegree</span><span class="o">(</span><span class="mi">3</span><span class="o">);</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="o">-</span><span class="mf">1.1</span><span class="o">))</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]</span> <span class="o">{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">polyDF</span> <span class="o">=</span> <span class="n">polyExpansion</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span>
+<span class="n">Row</span><span class="o">[]</span> <span class="n">row</span> <span class="o">=</span> <span class="n">polyDF</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;polyFeatures&quot;</span><span class="o">).</span><span class="na">take</span><span class="o">(</span><span class="mi">3</span><span class="o">);</span>
+<span class="k">for</span> <span class="o">(</span><span class="n">Row</span> <span class="n">r</span> <span class="o">:</span> <span class="n">row</span><span class="o">)</span> <span class="o">{</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">r</span><span class="o">.</span><span class="na">get</span><span class="o">(</span><span class="mi">0</span><span class="o">));</span>
+<span class="o">}</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">PolynomialExpansion</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span>
+
+<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
+ <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="o">-</span><span class="mf">2.0</span><span class="p">,</span> <span class="mf">2.3</span><span class="p">]),</span> <span class="p">),</span>
+ <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">]),</span> <span class="p">),</span>
+ <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.6</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.1</span><span class="p">]),</span> <span class="p">)],</span>
+ <span class="p">[</span><span class="s">&quot;features&quot;</span><span class="p">])</span>
+<span class="n">px</span> <span class="o">=</span> <span class="n">PolynomialExpansion</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;polyFeatures&quot;</span><span class="p">)</span>
+<span class="n">polyDF</span> <span class="o">=</span> <span class="n">px</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
+<span class="k">for</span> <span class="n">expanded</span> <span class="ow">in</span> <span class="n">polyDF</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;polyFeatures&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">3</span><span class="p">):</span>
+ <span class="k">print</span><span class="p">(</span><span class="n">expanded</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="discrete-cosine-transform-dct">Discrete Cosine Transform (DCT)</h2>
+
+<p>The <a href="https://en.wikipedia.org/wiki/Discrete_cosine_transform">Discrete Cosine
+Transform</a>
+transforms a length $N$ real-valued sequence in the time domain into
+another length $N$ real-valued sequence in the frequency domain. A
+<a href="api/scala/index.html#org.apache.spark.ml.feature.DCT">DCT</a> class
+provides this functionality, implementing the
+<a href="https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II">DCT-II</a>
+and scaling the result by $1/\sqrt{2}$ such that the representing matrix
+for the transform is unitary. No shift is applied to the transformed
+sequence (e.g. the $0$th element of the transformed sequence is the
+$0$th DCT coefficient and <em>not</em> the $N/2$th).</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.DCT</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Seq</span><span class="o">(</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">),</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(-</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">4.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">7.0</span><span class="o">),</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">14.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">5.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">))</span>
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">dct</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">DCT</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;featuresDCT&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setInverse</span><span class="o">(</span><span class="kc">false</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">dctDf</span> <span class="k">=</span> <span class="n">dct</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+<span class="n">dctDf</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;featuresDCT&quot;</span><span class="o">).</span><span class="n">show</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaSparkContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.DCT</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.VectorUDT</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.SQLContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(-</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">4.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">7.0</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">14.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">,</span> <span class="o">-</span><span class="mf">5.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">))</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]</span> <span class="o">{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">DCT</span> <span class="n">dct</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">DCT</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;featuresDCT&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setInverse</span><span class="o">(</span><span class="kc">false</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">dctDf</span> <span class="o">=</span> <span class="n">dct</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span>
+<span class="n">dctDf</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;featuresDCT&quot;</span><span class="o">).</span><span class="na">show</span><span class="o">(</span><span class="mi">3</span><span class="o">);</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="stringindexer">StringIndexer</h2>
+
+<p><code>StringIndexer</code> encodes a string column of labels to a column of label indices.
+The indices are in <code>[0, numLabels)</code>, ordered by descending label frequency,
+so the most frequent label gets index <code>0</code>.
+If the input column is numeric, we cast it to string and index the string
+values. When downstream pipeline components such as <code>Estimator</code> or
+<code>Transformer</code> make use of this string-indexed label, you must set the input
+column of the component to this string-indexed column name. In many cases,
+you can set the input column with <code>setInputCol</code>.</p>
+
+<p><strong>Examples</strong></p>
+
+<p>Assume that we have the following DataFrame with columns <code>id</code> and <code>category</code>:</p>
+
+<pre><code> id | category
+----|----------
+ 0 | a
+ 1 | b
+ 2 | c
+ 3 | a
+ 4 | a
+ 5 | c
+</code></pre>
+
+<p><code>category</code> is a string column with three labels: &#8220;a&#8221;, &#8220;b&#8221;, and &#8220;c&#8221;.
+Applying <code>StringIndexer</code> with <code>category</code> as the input column and <code>categoryIndex</code> as the output
+column, we should get the following:</p>
+
+<pre><code> id | category | categoryIndex
+----|----------|---------------
+ 0 | a | 0.0
+ 1 | b | 2.0
+ 2 | c | 1.0
+ 3 | a | 0.0
+ 4 | a | 0.0
+ 5 | c | 1.0
+</code></pre>
+
+<p>&#8220;a&#8221; gets index <code>0</code> because it is the most frequent, followed by &#8220;c&#8221; with index <code>1</code> and &#8220;b&#8221; with
+index <code>2</code>.</p>
+
+<div class="codetabs">
+
+<div data-lang="scala">
+
+ <p><a href="api/scala/index.html#org.apache.spark.ml.feature.StringIndexer"><code>StringIndexer</code></a> takes an input
+column name and an output column name.</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexer</span>
+
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span>
+ <span class="nc">Seq</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span> <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">),</span> <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">),</span> <span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span> <span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span> <span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">))</span>
+<span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="s">&quot;category&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">indexer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StringIndexer</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;category&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;categoryIndex&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">indexed</span> <span class="k">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+<span class="n">indexed</span><span class="o">.</span><span class="n">show</span><span class="o">()</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+ <p><a href="api/java/org/apache/spark/ml/feature/StringIndexer.html"><code>StringIndexer</code></a> takes an input column
+name and an output column name.</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">types</span><span class="o">.</span><span class="na">DataTypes</span><span class="o">.*;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">)</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]</span> <span class="o">{</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;category&quot;</span><span class="o">,</span> <span class="n">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">)</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">StringIndexer</span> <span class="n">indexer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StringIndexer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;category&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;categoryIndex&quot;</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">indexed</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">).</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span>
+<span class="n">indexed</span><span class="o">.</span><span class="na">show</span><span class="o">();</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <p><a href="api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer"><code>StringIndexer</code></a> takes an input
+column name and an output column name.</p>
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">StringIndexer</span>
+
+<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
+ <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="s">&quot;a&quot;</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">&quot;b&quot;</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">&quot;c&quot;</span><span class="p">),</span> <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="s">&quot;a&quot;</span><span class="p">),</span> <span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="s">&quot;a&quot;</span><span class="p">),</span> <span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="s">&quot;c&quot;</span><span class="p">)],</span>
+ <span class="p">[</span><span class="s">&quot;id&quot;</span><span class="p">,</span> <span class="s">&quot;category&quot;</span><span class="p">])</span>
+<span class="n">indexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;category&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;categoryIndex&quot;</span><span class="p">)</span>
+<span class="n">indexed</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
+<span class="n">indexed</span><span class="o">.</span><span class="n">show</span><span class="p">()</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="onehotencoder">OneHotEncoder</h2>
+
+<p><a href="https://en.wikipedia.org/wiki/One-hot">One-hot encoding</a> maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">OneHotEncoder</span><span class="o">,</span> <span class="nc">StringIndexer</span><span class="o">}</span>
+
+<span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">)</span>
+<span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="s">&quot;category&quot;</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">indexer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StringIndexer</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;category&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;categoryIndex&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">indexed</span> <span class="k">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">df</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">encoder</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">OneHotEncoder</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;categoryIndex&quot;</span><span class="o">).</span>
+ <span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;categoryVec&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">encoded</span> <span class="k">=</span> <span class="n">encoder</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">indexed</span><span class="o">)</span>
+<span class="n">encoded</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="s">&quot;categoryVec&quot;</span><span class="o">).</span><span class="n">foreach</span><span class="o">(</span><span class="n">println</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.OneHotEncoder</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StringIndexerModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="s">&quot;b&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">4</span><span class="o">,</span> <span class="s">&quot;a&quot;</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">5</span><span class="o">,</span> <span class="s">&quot;c&quot;</span><span class="o">)</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;category&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">StringIndexerModel</span> <span class="n">indexer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StringIndexer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;category&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;categoryIndex&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">indexed</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">df</span><span class="o">);</span>
+
+<span class="n">OneHotEncoder</span> <span class="n">encoder</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">OneHotEncoder</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;categoryIndex&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;categoryVec&quot;</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">encoded</span> <span class="o">=</span> <span class="n">encoder</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">indexed</span><span class="o">);</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">OneHotEncoder</span><span class="p">,</span> <span class="n">StringIndexer</span>
+
+<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span>
+ <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">&quot;a&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">&quot;b&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">&quot;c&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="s">&quot;a&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">4</span><span class="p">,</span> <span class="s">&quot;a&quot;</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="s">&quot;c&quot;</span><span class="p">)</span>
+<span class="p">],</span> <span class="p">[</span><span class="s">&quot;id&quot;</span><span class="p">,</span> <span class="s">&quot;category&quot;</span><span class="p">])</span>
+
+<span class="n">stringIndexer</span> <span class="o">=</span> <span class="n">StringIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;category&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;categoryIndex&quot;</span><span class="p">)</span>
+<span class="n">model</span> <span class="o">=</span> <span class="n">stringIndexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
+<span class="n">indexed</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
+<span class="n">encoder</span> <span class="o">=</span> <span class="n">OneHotEncoder</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;categoryIndex&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;categoryVec&quot;</span><span class="p">)</span>
+<span class="n">encoded</span> <span class="o">=</span> <span class="n">encoder</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">indexed</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="vectorindexer">VectorIndexer</h2>
+
+<p><code>VectorIndexer</code> helps index categorical features in datasets of <code>Vector</code>s.
+It can both automatically decide which features are categorical and convert original values to category indices. Specifically, it does the following:</p>
+
+<ol>
+ <li>Take an input column of type <a href="api/scala/index.html#org.apache.spark.mllib.linalg.Vector">Vector</a> and a parameter <code>maxCategories</code>.</li>
+ <li>Decide which features should be categorical based on the number of distinct values, where features with at most <code>maxCategories</code> distinct values are declared categorical.</li>
+ <li>Compute 0-based category indices for each categorical feature.</li>
+ <li>Index categorical features and transform original feature values to indices.</li>
+</ol>
+
+<p>Indexing categorical features allows algorithms such as Decision Trees and Tree Ensembles to treat categorical features appropriately, improving performance.</p>
+
+<p>Please refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer">VectorIndexer API docs</a> for more details.</p>
+
+<p>In the example below, we read in a dataset of labeled points and then use <code>VectorIndexer</code> to decide which features should be treated as categorical. We transform the categorical feature values to their indices. This transformed data could then be passed to algorithms such as <code>DecisionTreeRegressor</code> that handle categorical features.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.VectorIndexer</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="o">).</span><span class="n">toDF</span><span class="o">()</span>
+<span class="k">val</span> <span class="n">indexer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">VectorIndexer</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;indexed&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setMaxCategories</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">indexerModel</span> <span class="k">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">data</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">categoricalFeatures</span><span class="k">:</span> <span class="kt">Set</span><span class="o">[</span><span class="kt">Int</span><span class="o">]</span> <span class="k">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="n">categoryMaps</span><span class="o">.</span><span class="n">keys</span><span class="o">.</span><span class="n">toSet</span>
+<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">&quot;Chose ${categoricalFeatures.size} categorical features: &quot;</span> <span class="o">+</span>
+ <span class="n">categoricalFeatures</span><span class="o">.</span><span class="n">mkString</span><span class="o">(</span><span class="s">&quot;, &quot;</span><span class="o">))</span>
+
+<span class="c1">// Create new column &quot;indexed&quot; with categorical values transformed to indices</span>
+<span class="k">val</span> <span class="n">indexedData</span> <span class="k">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">data</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Map</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.VectorIndexer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.VectorIndexerModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">rdd</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">.</span><span class="na">sc</span><span class="o">(),</span>
+ <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span>
+<span class="n">DataFrame</span> <span class="n">data</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">rdd</span><span class="o">,</span> <span class="n">LabeledPoint</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
+<span class="n">VectorIndexer</span> <span class="n">indexer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">VectorIndexer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;indexed&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setMaxCategories</span><span class="o">(</span><span class="mi">10</span><span class="o">);</span>
+<span class="n">VectorIndexerModel</span> <span class="n">indexerModel</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">data</span><span class="o">);</span>
+<span class="n">Map</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Map</span><span class="o">&lt;</span><span class="n">Double</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;&gt;</span> <span class="n">categoryMaps</span> <span class="o">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="na">javaCategoryMaps</span><span class="o">();</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="s">&quot;Chose &quot;</span> <span class="o">+</span> <span class="n">categoryMaps</span><span class="o">.</span><span class="na">size</span><span class="o">()</span> <span class="o">+</span> <span class="s">&quot; categorical features:&quot;</span><span class="o">);</span>
+<span class="k">for</span> <span class="o">(</span><span class="n">Integer</span> <span class="n">feature</span> <span class="o">:</span> <span class="n">categoryMaps</span><span class="o">.</span><span class="na">keySet</span><span class="o">())</span> <span class="o">{</span>
+ <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="s">&quot; &quot;</span> <span class="o">+</span> <span class="n">feature</span><span class="o">);</span>
+<span class="o">}</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">();</span>
+
+<span class="c1">// Create new column &quot;indexed&quot; with categorical values transformed to indices</span>
+<span class="n">DataFrame</span> <span class="n">indexedData</span> <span class="o">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">data</span><span class="o">);</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">VectorIndexer</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span>
+
+<span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">toDF</span><span class="p">()</span>
+<span class="n">indexer</span> <span class="o">=</span> <span class="n">VectorIndexer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;indexed&quot;</span><span class="p">,</span> <span class="n">maxCategories</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
+<span class="n">indexerModel</span> <span class="o">=</span> <span class="n">indexer</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
+
+<span class="c"># Create new column &quot;indexed&quot; with categorical values transformed to indices</span>
+<span class="n">indexedData</span> <span class="o">=</span> <span class="n">indexerModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">data</span><span class="p">)</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="normalizer">Normalizer</h2>
+
+<p><code>Normalizer</code> is a <code>Transformer</code> which transforms a dataset of <code>Vector</code> rows, normalizing each <code>Vector</code> to have unit norm. It takes parameter <code>p</code>, which specifies the <a href="http://en.wikipedia.org/wiki/Norm_%28mathematics%29#p-norm">p-norm</a> used for normalization. ($p = 2$ by default.) This normalization can help standardize your input data and improve the behavior of learning algorithms.</p>
+
+<p>The following example demonstrates how to load a dataset in libsvm format and then normalize each row to have unit $L^1$ norm and unit $L^\infty$ norm.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Normalizer</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">)</span>
+
+<span class="c1">// Normalize each Vector using $L^1$ norm.</span>
+<span class="k">val</span> <span class="n">normalizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Normalizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;normFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setP</span><span class="o">(</span><span class="mf">1.0</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">l1NormData</span> <span class="k">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span>
+
+<span class="c1">// Normalize each Vector using $L^\infty$ norm.</span>
+<span class="k">val</span> <span class="n">lInfNormData</span> <span class="k">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">,</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">p</span> <span class="o">-&gt;</span> <span class="nc">Double</span><span class="o">.</span><span class="nc">PositiveInfinity</span><span class="o">)</span></code></pre></div>
+
+</div>
+
+<div data-lang="java">
+
+<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Normalizer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span>
+ <span class="n">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">jsc</span><span class="o">.</span><span class="na">sc</span><span class="o">(),</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span>
+<span class="n">DataFrame</span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">LabeledPoint</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
+
+<span class="c1">// Normalize each Vector using $L^1$ norm.</span>
+<span class="n">Normalizer</span> <span class="n">normalizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Normalizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;normFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setP</span><span class="o">(</span><span class="mf">1.0</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">l1NormData</span> <span class="o">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span>
+
+<span class="c1">// Normalize each Vector using $L^\infty$ norm.</span>
+<span class="n">DataFrame</span> <span class="n">lInfNormData</span> <span class="o">=</span>
+ <span class="n">normalizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">,</span> <span class="n">normalizer</span><span class="o">.</span><span class="na">p</span><span class="o">().</span><span class="na">w</span><span class="o">(</span><span class="n">Double</span><span class="o">.</span><span class="na">POSITIVE_INFINITY</span><span class="o">));</span></code></pre></div>
+
+</div>
+
+<div data-lang="python">
+
+<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span>
+<span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Normalizer</span>
+
+<span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="p">)</span>
+<span class="n">dataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
+
+<span class="c"># Normalize each Vector using $L^1$ norm.</span>
+<span class="n">normalizer</span> <span class="o">=</span> <span class="n">Normalizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;normFeatures&quot;</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="mf">1.0</span><span class="p">)</span>
+<span class="n">l1NormData</span> <span class="o">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span>
+
+<span class="c"># Normalize each Vector using $L^\infty$ norm.</span>
+<span class="n">lInfNormData</span> <span class="o">=</span> <span class="n">normalizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">,</span> <span class="p">{</span><span class="n">normalizer</span><span class="o">.</span><span class="n">p</span><span class="p">:</span> <span class="nb">float</span><span class="p">(</span><span class="s">&quot;inf&quot;</span><span class="p">)})</span></code></pre></div>
+
+</div>
+</div>
+
+<h2 id="standardscaler">StandardScaler</h2>
+
+<p><code>StandardScaler</code> transforms a dataset of <code>Vector</code> rows, normalizing each feature to have unit standard deviation and/or zero mean. It takes parameters:</p>
+
+<ul>
+ <li><code>withStd</code>: True by default. Scales the data to unit standard deviation.</li>
+ <li><code>withMean</code>: False by default. Centers the data with mean before scaling. It will build a dense output, so this does not work on sparse input and will raise an exception.</li>
+</ul>
+
+<p><code>StandardScaler</code> is an <code>Estimator</code> which can be <code>fit</code> on a dataset to produce a <code>StandardScalerModel</code>; this amounts to computing summary statistics. The model can then transform a <code>Vector</code> column in a dataset to have unit standard deviation and/or zero mean features.</p>
+
+<p>Note that if the standard deviation of a feature is zero, the model returns the default value <code>0.0</code> in the <code>Vector</code> for that feature.</p>
+
+<p>More details can be found in the API docs for
+<a href="api/scala/index.html#org.apache.spark.ml.feature.StandardScaler">StandardScaler</a> and
+<a href="api/scala/index.html#org.apache.spark.ml.feature.StandardScalerModel">StandardScalerModel</a>.</p>
+
+<p>The following example demonstrates how to load a dataset in libsvm format and then normalize each feature to have unit standard deviation.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.StandardScaler</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">scaler</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StandardScaler</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;scaledFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setWithStd</span><span class="o">(</span><span class="kc">true</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setWithMean</span><span class="o">(</span><span class="kc">false</span><span class="o">)</span>
+
+<span class="c1">// Compute summary statistics by fitting the StandardScaler</span>
+<span class="k">val</span> <span class="n">scalerModel</span> <span class="k">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span>
+
+<span class="c1">// Normalize each feature to have unit standard deviation.</span>
+<span class="k">val</span> <span class="n">scaledData</span> <span class="k">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span></code></pre></div>
+
+</div>
+
+<div data-lang="java">
+
+<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StandardScaler</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StandardScalerModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span>
+ <span class="n">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">jsc</span><span class="o">.</span><span class="na">sc</span><span class="o">(),</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span>
+<span class="n">DataFrame</span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">LabeledPoint</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
+<span class="n">StandardScaler</span> <span class="n">scaler</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StandardScaler</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;scaledFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setWithStd</span><span class="o">(</span><span class="kc">true</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setWithMean</span><span class="o">(</span><span class="kc">false</span><span class="o">);</span>
+
+<span class="c1">// Compute summary statistics by fitting the StandardScaler</span>
+<span class="n">StandardScalerModel</span> <span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span>
+
+<span class="c1">// Normalize each feature to have unit standard deviation.</span>
+<span class="n">DataFrame</span> <span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span></code></pre></div>
+
+</div>
+
+<div data-lang="python">
+
+<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span>
+<span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">StandardScaler</span>
+
+<span class="n">data</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="p">)</span>
+<span class="n">dataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
+<span class="n">scaler</span> <span class="o">=</span> <span class="n">StandardScaler</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;scaledFeatures&quot;</span><span class="p">,</span>
+ <span class="n">withStd</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">withMean</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span>
+
+<span class="c"># Compute summary statistics by fitting the StandardScaler</span>
+<span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span>
+
+<span class="c"># Normalize each feature to have unit standard deviation.</span>
+<span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span></code></pre></div>
+
+</div>
+</div>
+
+<h2 id="minmaxscaler">MinMaxScaler</h2>
+
+<p><code>MinMaxScaler</code> transforms a dataset of <code>Vector</code> rows, rescaling each feature to a specific range (often [0, 1]). It takes parameters:</p>
+
+<ul>
+ <li><code>min</code>: 0.0 by default. Lower bound after transformation, shared by all features.</li>
+ <li><code>max</code>: 1.0 by default. Upper bound after transformation, shared by all features.</li>
+</ul>
+
+<p><code>MinMaxScaler</code> computes summary statistics on a data set and produces a <code>MinMaxScalerModel</code>. The model can then transform each feature individually such that it is in the given range.</p>
+
+<p>The rescaled value for a feature E is calculated as,
+<code>\begin{equation}
+ Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
+\end{equation}</code>
+For the case <code>E_{max} == E_{min}</code>, <code>Rescaled(e_i) = 0.5 * (max + min)</code></p>
+
+<p>Note that since zero values will probably be transformed to non-zero values, the output of the transformer will be a <code>DenseVector</code> even for sparse input.</p>
+
+<p>The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1].</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+ <p>More details can be found in the API docs for
+<a href="api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler">MinMaxScaler</a> and
+<a href="api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel">MinMaxScalerModel</a>.</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.MinMaxScaler</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">scaler</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">MinMaxScaler</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;scaledFeatures&quot;</span><span class="o">)</span>
+
+<span class="c1">// Compute summary statistics and generate MinMaxScalerModel</span>
+<span class="k">val</span> <span class="n">scalerModel</span> <span class="k">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span>
+
+<span class="c1">// rescale each feature to range [min, max].</span>
+<span class="k">val</span> <span class="n">scaledData</span> <span class="k">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+ <p>More details can be found in the API docs for
+<a href="api/java/org/apache/spark/ml/feature/MinMaxScaler.html">MinMaxScaler</a> and
+<a href="api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html">MinMaxScalerModel</a>.</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.MinMaxScaler</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.MinMaxScalerModel</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">LabeledPoint</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span>
+ <span class="n">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">jsc</span><span class="o">.</span><span class="na">sc</span><span class="o">(),</span> <span class="s">&quot;data/mllib/sample_libsvm_data.txt&quot;</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span>
+<span class="n">DataFrame</span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">LabeledPoint</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
+<span class="n">MinMaxScaler</span> <span class="n">scaler</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">MinMaxScaler</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;scaledFeatures&quot;</span><span class="o">);</span>
+
+<span class="c1">// Compute summary statistics and generate MinMaxScalerModel</span>
+<span class="n">MinMaxScalerModel</span> <span class="n">scalerModel</span> <span class="o">=</span> <span class="n">scaler</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span>
+
+<span class="c1">// rescale each feature to range [min, max].</span>
+<span class="n">DataFrame</span> <span class="n">scaledData</span> <span class="o">=</span> <span class="n">scalerModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="bucketizer">Bucketizer</h2>
+
+<p><code>Bucketizer</code> transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter:</p>
+
+<ul>
+ <li><code>splits</code>: Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which also includes y. Splits should be strictly increasing. Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, values outside the splits specified will be treated as errors. Two examples of <code>splits</code> are <code>Array(Double.NegativeInfinity, 0.0, 1.0, Double.PositiveInfinity)</code> and <code>Array(0.0, 1.0, 2.0)</code>.</li>
+</ul>
+
+<p>Note that if you do not know the upper and lower bounds of the targeted column, you should add <code>Double.NegativeInfinity</code> and <code>Double.PositiveInfinity</code> as the bounds of your splits to prevent a potential out-of-bounds exception from <code>Bucketizer</code>.</p>
+
+<p>Note also that the splits that you provided have to be in strictly increasing order, i.e. <code>s0 &lt; s1 &lt; s2 &lt; ... &lt; sn</code>.</p>
+
+<p>More details can be found in the API docs for <a href="api/scala/index.html#org.apache.spark.ml.feature.Bucketizer">Bucketizer</a>.</p>
+
+<p>The following example demonstrates how to bucketize a column of <code>Double</code>s into a column of bucket indices.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Bucketizer</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span>
+
+<span class="k">val</span> <span class="n">splits</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span><span class="nc">Double</span><span class="o">.</span><span class="nc">NegativeInfinity</span><span class="o">,</span> <span class="o">-</span><span class="mf">0.5</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.5</span><span class="o">,</span> <span class="nc">Double</span><span class="o">.</span><span class="nc">PositiveInfinity</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(-</span><span class="mf">0.5</span><span class="o">,</span> <span class="o">-</span><span class="mf">0.3</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.2</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="nc">Tuple1</span><span class="o">.</span><span class="n">apply</span><span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">bucketizer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">Bucketizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;bucketedFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setSplits</span><span class="o">(</span><span class="n">splits</span><span class="o">)</span>
+
+<span class="c1">// Transform original data into its bucket index.</span>
+<span class="k">val</span> <span class="n">bucketedData</span> <span class="k">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span></code></pre></div>
+
+</div>
+
+<div data-lang="java">
+
+<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
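+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Bucketizer</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>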
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="kt">double</span><span class="o">[]</span> <span class="n">splits</span> <span class="o">=</span> <span class="o">{</span><span class="n">Double</span><span class="o">.</span><span class="na">NEGATIVE_INFINITY</span><span class="o">,</span> <span class="o">-</span><span class="mf">0.5</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">,</span> <span class="mf">0.5</span><span class="o">,</span> <span class="n">Double</span><span class="o">.</span><span class="na">POSITIVE_INFINITY</span><span class="o">};</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(-</span><span class="mf">0.5</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(-</span><span class="mf">0.3</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.0</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.2</span><span class="o">)</span>
+<span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]</span> <span class="o">{</span>
+ <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span>
+<span class="o">});</span>
+<span class="n">DataFrame</span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+
+<span class="n">Bucketizer</span> <span class="n">bucketizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Bucketizer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;bucketedFeatures&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setSplits</span><span class="o">(</span><span class="n">splits</span><span class="o">);</span>
+
+<span class="c1">// Transform original data into its bucket index.</span>
+<span class="n">DataFrame</span> <span class="n">bucketedData</span> <span class="o">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">);</span></code></pre></div>
+
+</div>
+
+<div data-lang="python">
+
+<div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Bucketizer</span>
+
+<span class="n">splits</span> <span class="o">=</span> <span class="p">[</span><span class="o">-</span><span class="nb">float</span><span class="p">(</span><span class="s">&quot;inf&quot;</span><span class="p">),</span> <span class="o">-</span><span class="mf">0.5</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">,</span> <span class="nb">float</span><span class="p">(</span><span class="s">&quot;inf&quot;</span><span class="p">)]</span>
+
+<span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="o">-</span><span class="mf">0.5</span><span class="p">,),</span> <span class="p">(</span><span class="o">-</span><span class="mf">0.3</span><span class="p">,),</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,),</span> <span class="p">(</span><span class="mf">0.2</span><span class="p">,)]</span>
+<span class="n">dataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s">&quot;features&quot;</span><span class="p">])</span>
+
+<span class="n">bucketizer</span> <span class="o">=</span> <span class="n">Bucketizer</span><span class="p">(</span><span class="n">splits</span><span class="o">=</span><span class="n">splits</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;bucketedFeatures&quot;</span><span class="p">)</span>
+
+<span class="c"># Transform original data into its bucket index.</span>
+<span class="n">bucketedData</span> <span class="o">=</span> <span class="n">bucketizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataFrame</span><span class="p">)</span></code></pre></div>
+
+</div>
+</div>
+
+<h2 id="elementwiseproduct">ElementwiseProduct</h2>
+
+<p>ElementwiseProduct multiplies each input vector by a provided &#8220;weight&#8221; vector, using element-wise multiplication. In other words, it scales each column of the dataset by a scalar multiplier. This represents the <a href="https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29">Hadamard product</a> between the input vector, <code>v</code>, and the transforming vector, <code>w</code>, to yield a result vector.</p>
+
+<p><code>\[ \begin{pmatrix}
+v_1 \\
+\vdots \\
+v_N
+\end{pmatrix} \circ \begin{pmatrix}
+ w_1 \\
+ \vdots \\
+ w_N
+ \end{pmatrix}
+= \begin{pmatrix}
+ v_1 w_1 \\
+ \vdots \\
+ v_N w_N
+ \end{pmatrix}
+\]</code></p>
+
+<p><a href="api/scala/index.html#org.apache.spark.ml.feature.ElementwiseProduct"><code>ElementwiseProduct</code></a> takes the following parameter:</p>
+
+<ul>
+ <li><code>scalingVec</code>: the transforming vector.</li>
+</ul>
+
+<p>The example below demonstrates how to transform vectors using a transforming vector value.</p>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.ElementwiseProduct</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span>
+
+<span class="c1">// Create some vector data; also works for sparse vectors</span>
+<span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="o">(</span><span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">)),</span>
+ <span class="o">(</span><span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">)))).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="s">&quot;vector&quot;</span><span class="o">)</span>
+
+<span class="k">val</span> <span class="n">transformingVector</span> <span class="k">=</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">transformer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">ElementwiseProduct</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setScalingVec</span><span class="o">(</span><span class="n">transformingVector</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;vector&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;transformedVector&quot;</span><span class="o">)</span>
+
+<span class="c1">// Batch transform the vectors to create new column:</span>
+<span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">).</span><span class="n">show</span><span class="o">()</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
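+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.ArrayList</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span>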
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.ElementwiseProduct</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vector</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.VectorUDT</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.SQLContext</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.DataTypes</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.Metadata</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span>
+
+<span class="c1">// Create some vector data; also works for sparse vectors</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="s">&quot;a&quot;</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">,</span> <span class="mf">3.0</span><span class="o">)),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="s">&quot;b&quot;</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">4.0</span><span class="o">,</span> <span class="mf">5.0</span><span class="o">,</span> <span class="mf">6.0</span><span class="o">))</span>
+<span class="o">));</span>
+<span class="n">List</span><span class="o">&lt;</span><span class="n">StructField</span><span class="o">&gt;</span> <span class="n">fields</span> <span class="o">=</span> <span class="k">new</span> <span class="n">ArrayList</span><span class="o">&lt;</span><span class="n">StructField</span><span class="o">&gt;(</span><span class="mi">2</span><span class="o">);</span>
+<span class="n">fields</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructField</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">));</span>
+<span class="n">fields</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructField</span><span class="o">(</span><span class="s">&quot;vector&quot;</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">));</span>
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createStructType</span><span class="o">(</span><span class="n">fields</span><span class="o">);</span>
+<span class="n">DataFrame</span> <span class="n">dataFrame</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+<span class="n">Vector</span> <span class="n">transformingVector</span> <span class="o">=</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="mf">2.0</span><span class="o">);</span>
+<span class="n">ElementwiseProduct</span> <span class="n">transformer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">ElementwiseProduct</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setScalingVec</span><span class="o">(</span><span class="n">transformingVector</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;vector&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;transformedVector&quot;</span><span class="o">);</span>
+<span class="c1">// Batch transform the vectors to create new column:</span>
+<span class="n">transformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">).</span><span class="na">show</span><span class="o">();</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">ElementwiseProduct</span>
+<span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span>
+
+<span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">]),),</span> <span class="p">(</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">4.0</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">,</span> <span class="mf">6.0</span><span class="p">]),)]</span>
+<span class="n">df</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="p">[</span><span class="s">&quot;vector&quot;</span><span class="p">])</span>
+<span class="n">transformer</span> <span class="o">=</span> <span class="n">ElementwiseProduct</span><span class="p">(</span><span class="n">scalingVec</span><span class="o">=</span><span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">]),</span>
+ <span class="n">inputCol</span><span class="o">=</span><span class="s">&quot;vector&quot;</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;transformedVector&quot;</span><span class="p">)</span>
+<span class="n">transformer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">df</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span></code></pre></div>
+
+ </div>
+
+</div>
+
+<h2 id="vectorassembler">VectorAssembler</h2>
+
+<p><code>VectorAssembler</code> is a transformer that combines a given list of columns into a single vector
+column.
+It is useful for combining raw features and features generated by different feature transformers
+into a single feature vector, in order to train ML models like logistic regression and decision
+trees.
+<code>VectorAssembler</code> accepts the following input column types: all numeric types, boolean type,
+and vector type.
+In each row, the values of the input columns will be concatenated into a vector in the specified
+order.</p>
+
+<p><strong>Examples</strong></p>
+
+<p>Assume that we have a DataFrame with the columns <code>id</code>, <code>hour</code>, <code>mobile</code>, <code>userFeatures</code>,
+and <code>clicked</code>:</p>
+
+<pre><code> id | hour | mobile | userFeatures | clicked
+----|------|--------|------------------|---------
+ 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0
+</code></pre>
+
+<p><code>userFeatures</code> is a vector column that contains three user features.
+We want to combine <code>hour</code>, <code>mobile</code>, and <code>userFeatures</code> into a single feature vector
+called <code>features</code> and use it to predict <code>clicked</code> or not.
+If we set <code>VectorAssembler</code>&#8217;s input columns to <code>hour</code>, <code>mobile</code>, and <code>userFeatures</code> and
+output column to <code>features</code>, after transformation we should get the following DataFrame:</p>
+
+<pre><code> id | hour | mobile | userFeatures | clicked | features
+----|------|--------|------------------|---------|-----------------------------
+ 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 | [18.0, 1.0, 0.0, 10.0, 0.5]
+</code></pre>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <p><a href="api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler"><code>VectorAssembler</code></a> takes an array
+of input column names and an output column name.</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.VectorAssembler</span>
+
+<span class="k">val</span> <span class="n">dataset</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span>
+ <span class="nc">Seq</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="mi">18</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">10.0</span><span class="o">,</span> <span class="mf">0.5</span><span class="o">),</span> <span class="mf">1.0</span><span class="o">))</span>
+<span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="s">&quot;hour&quot;</span><span class="o">,</span> <span class="s">&quot;mobile&quot;</span><span class="o">,</span> <span class="s">&quot;userFeatures&quot;</span><span class="o">,</span> <span class="s">&quot;clicked&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">assembler</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">VectorAssembler</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setInputCols</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="s">&quot;hour&quot;</span><span class="o">,</span> <span class="s">&quot;mobile&quot;</span><span class="o">,</span> <span class="s">&quot;userFeatures&quot;</span><span class="o">))</span>
+ <span class="o">.</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">output</span> <span class="k">=</span> <span class="n">assembler</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="s">&quot;clicked&quot;</span><span class="o">).</span><span class="n">first</span><span class="o">())</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <p><a href="api/java/org/apache/spark/ml/feature/VectorAssembler.html"><code>VectorAssembler</code></a> takes an array
+of input column names and an output column name.</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.VectorUDT</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">types</span><span class="o">.</span><span class="na">DataTypes</span><span class="o">.*;</span>
+
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">createStructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]</span> <span class="o">{</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;hour&quot;</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;mobile&quot;</span><span class="o">,</span> <span class="n">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;userFeatures&quot;</span><span class="o">,</span> <span class="k">new</span> <span class="nf">VectorUDT</span><span class="o">(),</span> <span class="kc">false</span><span class="o">),</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;clicked&quot;</span><span class="o">,</span> <span class="n">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">)</span>
+<span class="o">});</span>
+<span class="n">Row</span> <span class="n">row</span> <span class="o">=</span> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="mi">18</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="mf">10.0</span><span class="o">,</span> <span class="mf">0.5</span><span class="o">),</span> <span class="mf">1.0</span><span class="o">);</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">rdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="n">row</span><span class="o">));</span>
+<span class="n">DataFrame</span> <span class="n">dataset</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">rdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+
+<span class="n">VectorAssembler</span> <span class="n">assembler</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">VectorAssembler</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCols</span><span class="o">(</span><span class="k">new</span> <span class="n">String</span><span class="o">[]</span> <span class="o">{</span><span class="s">&quot;hour&quot;</span><span class="o">,</span> <span class="s">&quot;mobile&quot;</span><span class="o">,</span> <span class="s">&quot;userFeatures&quot;</span><span class="o">})</span>
+ <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">);</span>
+
+<span class="n">DataFrame</span> <span class="n">output</span> <span class="o">=</span> <span class="n">assembler</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">);</span>
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">output</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="s">&quot;clicked&quot;</span><span class="o">).</span><span class="na">first</span><span class="o">());</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <p><a href="api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler"><code>VectorAssembler</code></a> takes a list
+of input column names and an output column name.</p>
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span>
+<span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">VectorAssembler</span>
+
+<span class="n">dataset</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
+ <span class="p">[(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="n">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">10.0</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">]),</span> <span class="mf">1.0</span><span class="p">)],</span>
+ <span class="p">[</span><span class="s">&quot;id&quot;</span><span class="p">,</span> <span class="s">&quot;hour&quot;</span><span class="p">,</span> <span class="s">&quot;mobile&quot;</span><span class="p">,</span> <span class="s">&quot;userFeatures&quot;</span><span class="p">,</span> <span class="s">&quot;clicked&quot;</span><span class="p">])</span>
+<span class="n">assembler</span> <span class="o">=</span> <span class="n">VectorAssembler</span><span class="p">(</span>
+ <span class="n">inputCols</span><span class="o">=</span><span class="p">[</span><span class="s">&quot;hour&quot;</span><span class="p">,</span> <span class="s">&quot;mobile&quot;</span><span class="p">,</span> <span class="s">&quot;userFeatures&quot;</span><span class="p">],</span>
+ <span class="n">outputCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">)</span>
+<span class="n">output</span> <span class="o">=</span> <span class="n">assembler</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
+<span class="k">print</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="s">&quot;clicked&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">first</span><span class="p">())</span></code></pre></div>
+
+ </div>
+</div>
+
+<h1 id="feature-selectors">Feature Selectors</h1>
+
+<h2 id="vectorslicer">VectorSlicer</h2>
+
+<p><code>VectorSlicer</code> is a transformer that takes a feature vector and outputs a new feature vector with a
+sub-array of the original features. It is useful for extracting features from a vector column.</p>
+
+<p><code>VectorSlicer</code> accepts a vector column with specified indices, then outputs a new vector column
+whose values are selected via those indices. There are two types of indices:</p>
+
+<ol>
+ <li>
+ <p>Integer indices that represent the indices into the vector, <code>setIndices()</code>;</p>
+ </li>
+ <li>
+ <p>String indices that represent the names of features in the vector, <code>setNames()</code>.
+ <em>This requires the vector column to have an <code>AttributeGroup</code> since the implementation matches on
+ the name field of an <code>Attribute</code>.</em></p>
+ </li>
+</ol>
+
+<p>Specification by integer and string are both acceptable. Moreover, you can use integer indices and
+string names simultaneously. At least one feature must be selected. Duplicate features are not
+allowed, so there can be no overlap between selected indices and names. Note that if names of
+features are selected, an exception will be thrown if the input attributes are empty.</p>
+
+<p>The output vector will order features with the selected indices first (in the order given),
+followed by the selected names (in the order given).</p>
+
+<p><strong>Examples</strong></p>
+
+<p>Suppose that we have a DataFrame with the column <code>userFeatures</code>:</p>
+
+<pre><code> userFeatures
+------------------
+ [0.0, 10.0, 0.5]
+</code></pre>
+
+<p><code>userFeatures</code> is a vector column that contains three user features. Assume that the first column
+of <code>userFeatures</code> is all zeros, so we want to remove it and select only the last two columns.
+The <code>VectorSlicer</code> selects the last two elements with <code>setIndices(1, 2)</code> and then produces a new vector
+column named <code>features</code>:</p>
+
+<pre><code> userFeatures | features
+------------------|-----------------------------
+ [0.0, 10.0, 0.5] | [10.0, 0.5]
+</code></pre>
+
+<p>Suppose also that we have potential input attributes for <code>userFeatures</code>, i.e.
+<code>["f1", "f2", "f3"]</code>; then we can use <code>setNames("f2", "f3")</code> to select them.</p>
+
+<pre><code> userFeatures | features
+------------------|-----------------------------
+ [0.0, 10.0, 0.5] | [10.0, 0.5]
+ ["f1", "f2", "f3"] | ["f2", "f3"]
+</code></pre>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <p><a href="api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer"><code>VectorSlicer</code></a> takes an input
+column name with specified indices or names and an output column name.</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.ml.attribute.</span><span class="o">{</span><span class="nc">Attribute</span><span class="o">,</span> <span class="nc">AttributeGroup</span><span class="o">,</span> <span class="nc">NumericAttribute</span><span class="o">}</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.VectorSlicer</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span>
+<span class="k">import</span> <span class="nn">org.apache.spark.sql.</span><span class="o">{</span><span class="nc">DataFrame</span><span class="o">,</span> <span class="nc">Row</span><span class="o">,</span> <span class="nc">SQLContext</span><span class="o">}</span>
+
+<span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">sparse</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="nc">Seq</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="o">-</span><span class="mf">2.0</span><span class="o">),</span> <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">))),</span>
+ <span class="nc">Vectors</span><span class="o">.</span><span class="n">dense</span><span class="o">(-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)</span>
+<span class="o">)</span>
+
+<span class="k">val</span> <span class="n">defaultAttr</span> <span class="k">=</span> <span class="nc">NumericAttribute</span><span class="o">.</span><span class="n">defaultAttr</span>
+<span class="k">val</span> <span class="n">attrs</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">(</span><span class="s">&quot;f1&quot;</span><span class="o">,</span> <span class="s">&quot;f2&quot;</span><span class="o">,</span> <span class="s">&quot;f3&quot;</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="n">defaultAttr</span><span class="o">.</span><span class="n">withName</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">attrGroup</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">AttributeGroup</span><span class="o">(</span><span class="s">&quot;userFeatures&quot;</span><span class="o">,</span> <span class="n">attrs</span><span class="o">.</span><span class="n">asInstanceOf</span><span class="o">[</span><span class="kt">Array</span><span class="o">[</span><span class="kt">Attribute</span><span class="o">]])</span>
+
+<span class="k">val</span> <span class="n">dataRDD</span> <span class="k">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="o">(</span><span class="n">data</span><span class="o">).</span><span class="n">map</span><span class="o">(</span><span class="nc">Row</span><span class="o">.</span><span class="n">apply</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">dataset</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">dataRDD</span><span class="o">,</span> <span class="nc">StructType</span><span class="o">(</span><span class="n">attrGroup</span><span class="o">.</span><span class="n">toStructField</span><span class="o">()))</span>
+
+<span class="k">val</span> <span class="n">slicer</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">VectorSlicer</span><span class="o">().</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">&quot;userFeatures&quot;</span><span class="o">).</span><span class="n">setOutputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+
+<span class="n">slicer</span><span class="o">.</span><span class="n">setIndices</span><span class="o">(</span><span class="mi">1</span><span class="o">).</span><span class="n">setNames</span><span class="o">(</span><span class="s">&quot;f3&quot;</span><span class="o">)</span>
+<span class="c1">// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array(&quot;f2&quot;, &quot;f3&quot;))</span>
+
+<span class="k">val</span> <span class="n">output</span> <span class="k">=</span> <span class="n">slicer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">)</span>
+<span class="n">println</span><span class="o">(</span><span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;userFeatures&quot;</span><span class="o">,</span> <span class="s">&quot;features&quot;</span><span class="o">).</span><span class="n">first</span><span class="o">())</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <p><a href="api/java/org/apache/spark/ml/feature/VectorSlicer.html"><code>VectorSlicer</code></a> takes an input column name
+with specified indices or names and an output column name.</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">types</span><span class="o">.</span><span class="na">DataTypes</span><span class="o">.*;</span>
+
+<span class="n">Attribute</span><span class="o">[]</span> <span class="n">attrs</span> <span class="o">=</span> <span class="k">new</span> <span class="n">Attribute</span><span class="o">[]{</span>
+ <span class="n">NumericAttribute</span><span class="o">.</span><span class="na">defaultAttr</span><span class="o">().</span><span class="na">withName</span><span class="o">(</span><span class="s">&quot;f1&quot;</span><span class="o">),</span>
+ <span class="n">NumericAttribute</span><span class="o">.</span><span class="na">defaultAttr</span><span class="o">().</span><span class="na">withName</span><span class="o">(</span><span class="s">&quot;f2&quot;</span><span class="o">),</span>
+ <span class="n">NumericAttribute</span><span class="o">.</span><span class="na">defaultAttr</span><span class="o">().</span><span class="na">withName</span><span class="o">(</span><span class="s">&quot;f3&quot;</span><span class="o">)</span>
+<span class="o">};</span>
+<span class="n">AttributeGroup</span> <span class="n">group</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">AttributeGroup</span><span class="o">(</span><span class="s">&quot;userFeatures&quot;</span><span class="o">,</span> <span class="n">attrs</span><span class="o">);</span>
+
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">jrdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Lists</span><span class="o">.</span><span class="na">newArrayList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">sparse</span><span class="o">(</span><span class="mi">3</span><span class="o">,</span> <span class="k">new</span> <span class="kt">int</span><span class="o">[]{</span><span class="mi">0</span><span class="o">,</span> <span class="mi">1</span><span class="o">},</span> <span class="k">new</span> <span class="kt">double</span><span class="o">[]{-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">})),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Vectors</span><span class="o">.</span><span class="na">dense</span><span class="o">(-</span><span class="mf">2.0</span><span class="o">,</span> <span class="mf">2.3</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">))</span>
+<span class="o">));</span>
+
+<span class="n">DataFrame</span> <span class="n">dataset</span> <span class="o">=</span> <span class="n">jsql</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">jrdd</span><span class="o">,</span> <span class="o">(</span><span class="k">new</span> <span class="nf">StructType</span><span class="o">()).</span><span class="na">add</span><span class="o">(</span><span class="n">group</span><span class="o">.</span><span class="na">toStructField</span><span class="o">()));</span>
+
+<span class="n">VectorSlicer</span> <span class="n">vectorSlicer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">VectorSlicer</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">&quot;userFeatures&quot;</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">);</span>
+
+<span class="n">vectorSlicer</span><span class="o">.</span><span class="na">setIndices</span><span class="o">(</span><span class="k">new</span> <span class="kt">int</span><span class="o">[]{</span><span class="mi">1</span><span class="o">}).</span><span class="na">setNames</span><span class="o">(</span><span class="k">new</span> <span class="n">String</span><span class="o">[]{</span><span class="s">&quot;f3&quot;</span><span class="o">});</span>
+<span class="c1">// or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{&quot;f2&quot;, &quot;f3&quot;})</span>
+
+<span class="n">DataFrame</span> <span class="n">output</span> <span class="o">=</span> <span class="n">vectorSlicer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">);</span>
+
+<span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="n">output</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;userFeatures&quot;</span><span class="o">,</span> <span class="s">&quot;features&quot;</span><span class="o">).</span><span class="na">first</span><span class="o">());</span></code></pre></div>
+
+ </div>
+</div>
+
+<h2 id="rformula">RFormula</h2>
+
+<p><code>RFormula</code> selects columns specified by an <a href="https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html">R model formula</a>. It produces a vector column of features and a double column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula.</p>
+
+<p><strong>Examples</strong></p>
+
+<p>Assume that we have a DataFrame with the columns <code>id</code>, <code>country</code>, <code>hour</code>, and <code>clicked</code>:</p>
+
+<pre><code>id | country | hour | clicked
+---|---------|------|---------
+ 7 | "US" | 18 | 1.0
+ 8 | "CA" | 12 | 0.0
+ 9 | "NZ" | 15 | 0.0
+</code></pre>
+
+<p>If we use <code>RFormula</code> with a formula string of <code>clicked ~ country + hour</code>, which indicates that we want to
+predict <code>clicked</code> based on <code>country</code> and <code>hour</code>, after transformation we should get the following DataFrame:</p>
+
+<pre><code>id | country | hour | clicked | features | label
+---|---------|------|---------|------------------|-------
+ 7 | "US" | 18 | 1.0 | [0.0, 0.0, 18.0] | 1.0
+ 8 | "CA" | 12 | 0.0 | [0.0, 1.0, 12.0] | 0.0
+ 9 | "NZ" | 15 | 0.0 | [1.0, 0.0, 15.0] | 0.0
+</code></pre>
+
+<div class="codetabs">
+<div data-lang="scala">
+
+ <p><a href="api/scala/index.html#org.apache.spark.ml.feature.RFormula"><code>RFormula</code></a> takes an R formula string, and optional parameters for the names of its output columns.</p>
+
+ <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.RFormula</span>
+
+<span class="k">val</span> <span class="n">dataset</span> <span class="k">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span>
+ <span class="o">(</span><span class="mi">7</span><span class="o">,</span> <span class="s">&quot;US&quot;</span><span class="o">,</span> <span class="mi">18</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">8</span><span class="o">,</span> <span class="s">&quot;CA&quot;</span><span class="o">,</span> <span class="mi">12</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span>
+ <span class="o">(</span><span class="mi">9</span><span class="o">,</span> <span class="s">&quot;NZ&quot;</span><span class="o">,</span> <span class="mi">15</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)</span>
+<span class="o">)).</span><span class="n">toDF</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="s">&quot;country&quot;</span><span class="o">,</span> <span class="s">&quot;hour&quot;</span><span class="o">,</span> <span class="s">&quot;clicked&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">formula</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">RFormula</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setFormula</span><span class="o">(</span><span class="s">&quot;clicked ~ country + hour&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setFeaturesCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setLabelCol</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">output</span> <span class="k">=</span> <span class="n">formula</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">dataset</span><span class="o">).</span><span class="n">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">)</span>
+<span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="s">&quot;label&quot;</span><span class="o">).</span><span class="n">show</span><span class="o">()</span></code></pre></div>
+
+ </div>
+
+<div data-lang="java">
+
+ <p><a href="api/java/org/apache/spark/ml/feature/RFormula.html"><code>RFormula</code></a> takes an R formula string, and optional parameters for the names of its output columns.</p>
+
+ <div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span>
+
+<span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.RFormula</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.DataFrame</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.RowFactory</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.*</span><span class="o">;</span>
+<span class="kn">import</span> <span class="nn">static</span> <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">.</span><span class="na">types</span><span class="o">.</span><span class="na">DataTypes</span><span class="o">.*;</span>
+
+<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="n">createStructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]</span> <span class="o">{</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;id&quot;</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;country&quot;</span><span class="o">,</span> <span class="n">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;hour&quot;</span><span class="o">,</span> <span class="n">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">),</span>
+ <span class="n">createStructField</span><span class="o">(</span><span class="s">&quot;clicked&quot;</span><span class="o">,</span> <span class="n">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">)</span>
+<span class="o">});</span>
+<span class="n">JavaRDD</span><span class="o">&lt;</span><span class="n">Row</span><span class="o">&gt;</span> <span class="n">rdd</span> <span class="o">=</span> <span class="n">jsc</span><span class="o">.</span><span class="na">parallelize</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">7</span><span class="o">,</span> <span class="s">&quot;US&quot;</span><span class="o">,</span> <span class="mi">18</span><span class="o">,</span> <span class="mf">1.0</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">8</span><span class="o">,</span> <span class="s">&quot;CA&quot;</span><span class="o">,</span> <span class="mi">12</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">),</span>
+ <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">9</span><span class="o">,</span> <span class="s">&quot;NZ&quot;</span><span class="o">,</span> <span class="mi">15</span><span class="o">,</span> <span class="mf">0.0</span><span class="o">)</span>
+<span class="o">));</span>
+<span class="n">DataFrame</span> <span class="n">dataset</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">rdd</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span>
+
+<span class="n">RFormula</span> <span class="n">formula</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">RFormula</span><span class="o">()</span>
+ <span class="o">.</span><span class="na">setFormula</span><span class="o">(</span><span class="s">&quot;clicked ~ country + hour&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setFeaturesCol</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="na">setLabelCol</span><span class="o">(</span><span class="s">&quot;label&quot;</span><span class="o">);</span>
+
+<span class="n">DataFrame</span> <span class="n">output</span> <span class="o">=</span> <span class="n">formula</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">dataset</span><span class="o">).</span><span class="na">transform</span><span class="o">(</span><span class="n">dataset</span><span class="o">);</span>
+<span class="n">output</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">&quot;features&quot;</span><span class="o">,</span> <span class="s">&quot;label&quot;</span><span class="o">).</span><span class="na">show</span><span class="o">();</span></code></pre></div>
+
+ </div>
+
+<div data-lang="python">
+
+ <p><a href="api/python/pyspark.ml.html#pyspark.ml.feature.RFormula"><code>RFormula</code></a> takes an R formula string, and optional parameters for the names of its output columns.</p>
+
+ <div class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">RFormula</span>
+
+<span class="n">dataset</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span>
+ <span class="p">[(</span><span class="mi">7</span><span class="p">,</span> <span class="s">&quot;US&quot;</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="s">&quot;CA&quot;</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">),</span>
+ <span class="p">(</span><span class="mi">9</span><span class="p">,</span> <span class="s">&quot;NZ&quot;</span><span class="p">,</span> <span class="mi">15</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">)],</span>
+ <span class="p">[</span><span class="s">&quot;id&quot;</span><span class="p">,</span> <span class="s">&quot;country&quot;</span><span class="p">,</span> <span class="s">&quot;hour&quot;</span><span class="p">,</span> <span class="s">&quot;clicked&quot;</span><span class="p">])</span>
+<span class="n">formula</span> <span class="o">=</span> <span class="n">RFormula</span><span class="p">(</span>
+ <span class="n">formula</span><span class="o">=</span><span class="s">&quot;clicked ~ country + hour&quot;</span><span class="p">,</span>
+ <span class="n">featuresCol</span><span class="o">=</span><span class="s">&quot;features&quot;</span><span class="p">,</span>
+ <span class="n">labelCol</span><span class="o">=</span><span class="s">&quot;label&quot;</span><span class="p">)</span>
+<span class="n">output</span> <span class="o">=</span> <span class="n">formula</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">dataset</span><span class="p">)</span>
+<span class="n">output</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">&quot;features&quot;</span><span class="p">,</span> <span class="s">&quot;label&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span></code></pre></div>
+
+ </div>
+</div>
+
+
+ </div> <!-- /container -->
+
+ <script src="js/vendor/jquery-1.8.0.min.js"></script>
+ <script src="js/vendor/bootstrap.min.js"></script>
+ <script src="js/vendor/anchor.min.js"></script>
+ <script src="js/main.js"></script>
+
+ <!-- MathJax Section -->
+ <script type="text/x-mathjax-config">
+ MathJax.Hub.Config({
+ TeX: { equationNumbers: { autoNumber: "AMS" } }
+ });
+ </script>
+ <script>
+ // Load MathJax dynamically from an absolute https:// URL so the page works from a local file
+ // (file://), HTTP and HTTPS alike; a protocol-relative "//host/..." URL would break on file://.
+ (function(d, script) {
+   script = d.createElement('script');
+   script.type = 'text/javascript';
+   script.async = true;
+   script.onload = function(){  // runs after MathJax.js has loaded, so the MathJax global exists
+     MathJax.Hub.Config({
+       tex2jax: {
+         inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],  // NOTE(review): two-backslash delimiter, unlike displayMath - confirm intended
+         displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
+         processEscapes: true,  // lets "\$" stand for a literal dollar sign
+         skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']  // never typeset inside these tags (code samples live in <pre>)
+       }
+     });
+   };
+   // cdn.mathjax.org was shut down in 2017; load a pinned 2.7.x release from cdnjs, always over HTTPS.
+   script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+   d.getElementsByTagName('head')[0].appendChild(script);
+ }(document));
+ </script>
+ </body>
+</html>