<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>pyspark.ml.feature — PySpark 1.5.0 documentation</title>
<link rel="stylesheet" href="../../../_static/nature.css" type="text/css" />
<link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" />
<script type="text/javascript">
// Sphinx runtime configuration consumed by doctools.js / searchtools.js.
// Auto-generated by the Sphinx build; do not edit by hand.
var DOCUMENTATION_OPTIONS = {
URL_ROOT: '../../../', // relative path from this page back to the doc root
VERSION: '1.5.0',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
HAS_SOURCE: true
};
</script>
<script type="text/javascript" src="../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../_static/doctools.js"></script>
<link rel="top" title="PySpark 1.5.0 documentation" href="../../../index.html" />
<link rel="up" title="Module code" href="../../index.html" />
</head>
<body role="document">
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="nav-item nav-item-0"><a href="../../../index.html">PySpark 1.5.0 documentation</a> »</li>
<li class="nav-item nav-item-1"><a href="../../index.html" accesskey="U">Module code</a> »</li>
</ul>
</div>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<h1>Source code for pyspark.ml.feature</h1><div class="highlight"><pre>
<span class="c">#</span>
<span class="c"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c"># this work for additional information regarding copyright ownership.</span>
<span class="c"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c"># (the "License"); you may not use this file except in compliance with</span>
<span class="c"># the License. You may obtain a copy of the License at</span>
<span class="c">#</span>
<span class="c"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c">#</span>
<span class="c"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c"># distributed under the License is distributed on an "AS IS" BASIS,</span>
<span class="c"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c"># See the License for the specific language governing permissions and</span>
<span class="c"># limitations under the License.</span>
<span class="c">#</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="k">if</span> <span class="n">sys</span><span class="o">.</span><span class="n">version</span> <span class="o">></span> <span class="s">'3'</span><span class="p">:</span>
<span class="nb">basestring</span> <span class="o">=</span> <span class="nb">str</span>
<span class="kn">from</span> <span class="nn">pyspark.rdd</span> <span class="kn">import</span> <span class="n">ignore_unicode_prefix</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.param.shared</span> <span class="kn">import</span> <span class="o">*</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.util</span> <span class="kn">import</span> <span class="n">keyword_only</span>
<span class="kn">from</span> <span class="nn">pyspark.ml.wrapper</span> <span class="kn">import</span> <span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">JavaModel</span><span class="p">,</span> <span class="n">JavaTransformer</span>
<span class="kn">from</span> <span class="nn">pyspark.mllib.common</span> <span class="kn">import</span> <span class="n">inherit_doc</span>
<span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">_convert_to_vector</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s">'Binarizer'</span><span class="p">,</span> <span class="s">'Bucketizer'</span><span class="p">,</span> <span class="s">'ElementwiseProduct'</span><span class="p">,</span> <span class="s">'HashingTF'</span><span class="p">,</span> <span class="s">'IDF'</span><span class="p">,</span> <span class="s">'IDFModel'</span><span class="p">,</span>
<span class="s">'NGram'</span><span class="p">,</span> <span class="s">'Normalizer'</span><span class="p">,</span> <span class="s">'OneHotEncoder'</span><span class="p">,</span> <span class="s">'PolynomialExpansion'</span><span class="p">,</span> <span class="s">'RegexTokenizer'</span><span class="p">,</span>
<span class="s">'StandardScaler'</span><span class="p">,</span> <span class="s">'StandardScalerModel'</span><span class="p">,</span> <span class="s">'StringIndexer'</span><span class="p">,</span> <span class="s">'StringIndexerModel'</span><span class="p">,</span>
<span class="s">'Tokenizer'</span><span class="p">,</span> <span class="s">'VectorAssembler'</span><span class="p">,</span> <span class="s">'VectorIndexer'</span><span class="p">,</span> <span class="s">'Word2Vec'</span><span class="p">,</span> <span class="s">'Word2VecModel'</span><span class="p">,</span>
<span class="s">'PCA'</span><span class="p">,</span> <span class="s">'PCAModel'</span><span class="p">,</span> <span class="s">'RFormula'</span><span class="p">,</span> <span class="s">'RFormulaModel'</span><span class="p">]</span>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="Binarizer"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Binarizer">[docs]</a><span class="k">class</span> <span class="nc">Binarizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Binarize a column of continuous features given a threshold.</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(0.5,)], ["values"])</span>
<span class="sd"> >>> binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")</span>
<span class="sd"> >>> binarizer.transform(df).head().features</span>
<span class="sd"> 0.0</span>
<span class="sd"> >>> binarizer.setParams(outputCol="freqs").transform(df).head().freqs</span>
<span class="sd"> 0.0</span>
<span class="sd"> >>> params = {binarizer.threshold: -0.5, binarizer.outputCol: "vector"}</span>
<span class="sd"> >>> binarizer.transform(df, params).head().vector</span>
<span class="sd"> 1.0</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">threshold</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"threshold"</span><span class="p">,</span>
<span class="s">"threshold in binary classification prediction, in range [0, 1]"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, threshold=0.0, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Binarizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.Binarizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">threshold</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"threshold"</span><span class="p">,</span>
<span class="s">"threshold in binary classification prediction, in range [0, 1]"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="Binarizer.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Binarizer.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, threshold=0.0, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this Binarizer.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="Binarizer.setThreshold"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Binarizer.setThreshold">[docs]</a> <span class="k">def</span> <span class="nf">setThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`threshold`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">threshold</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="Binarizer.getThreshold"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Binarizer.getThreshold">[docs]</a> <span class="k">def</span> <span class="nf">getThreshold</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of threshold or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">threshold</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="Bucketizer"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Bucketizer">[docs]</a><span class="k">class</span> <span class="nc">Bucketizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Maps a column of continuous features to a column of feature buckets.</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])</span>
<span class="sd"> >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],</span>
<span class="sd"> ... inputCol="values", outputCol="buckets")</span>
<span class="sd"> >>> bucketed = bucketizer.transform(df).collect()</span>
<span class="sd"> >>> bucketed[0].buckets</span>
<span class="sd"> 0.0</span>
<span class="sd"> >>> bucketed[1].buckets</span>
<span class="sd"> 0.0</span>
<span class="sd"> >>> bucketed[2].buckets</span>
<span class="sd"> 1.0</span>
<span class="sd"> >>> bucketed[3].buckets</span>
<span class="sd"> 2.0</span>
<span class="sd"> >>> bucketizer.setParams(outputCol="b").transform(df).head().b</span>
<span class="sd"> 0.0</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">splits</span> <span class="o">=</span> \
<span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"splits"</span><span class="p">,</span>
<span class="s">"Split points for mapping continuous features into buckets. With n+1 splits, "</span> <span class="o">+</span>
<span class="s">"there are n buckets. A bucket defined by splits x,y holds values in the "</span> <span class="o">+</span>
<span class="s">"range [x,y) except the last bucket, which also includes y. The splits "</span> <span class="o">+</span>
<span class="s">"should be strictly increasing. Values at -inf, inf must be explicitly "</span> <span class="o">+</span>
<span class="s">"provided to cover all Double values; otherwise, values outside the splits "</span> <span class="o">+</span>
<span class="s">"specified will be treated as errors."</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">splits</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, splits=None, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Bucketizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.Bucketizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="c">#: param for Splitting points for mapping continuous features into buckets. With n+1 splits,</span>
<span class="c"># there are n buckets. A bucket defined by splits x,y holds values in the range [x,y)</span>
<span class="c"># except the last bucket, which also includes y. The splits should be strictly increasing.</span>
<span class="c"># Values at -inf, inf must be explicitly provided to cover all Double values; otherwise,</span>
<span class="c"># values outside the splits specified will be treated as errors.</span>
<span class="bp">self</span><span class="o">.</span><span class="n">splits</span> <span class="o">=</span> \
<span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"splits"</span><span class="p">,</span>
<span class="s">"Split points for mapping continuous features into buckets. With n+1 splits, "</span> <span class="o">+</span>
<span class="s">"there are n buckets. A bucket defined by splits x,y holds values in the "</span> <span class="o">+</span>
<span class="s">"range [x,y) except the last bucket, which also includes y. The splits "</span> <span class="o">+</span>
<span class="s">"should be strictly increasing. Values at -inf, inf must be explicitly "</span> <span class="o">+</span>
<span class="s">"provided to cover all Double values; otherwise, values outside the splits "</span> <span class="o">+</span>
<span class="s">"specified will be treated as errors."</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="Bucketizer.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Bucketizer.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">splits</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, splits=None, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this Bucketizer.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="Bucketizer.setSplits"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Bucketizer.setSplits">[docs]</a> <span class="k">def</span> <span class="nf">setSplits</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`splits`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">splits</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="Bucketizer.getSplits"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Bucketizer.getSplits">[docs]</a> <span class="k">def</span> <span class="nf">getSplits</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of splits or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">splits</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="ElementwiseProduct"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct">[docs]</a><span class="k">class</span> <span class="nc">ElementwiseProduct</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Outputs the Hadamard product (i.e., the element-wise product) of each input vector</span>
<span class="sd"> with a provided "weight" vector. In other words, it scales each column of the dataset</span>
<span class="sd"> by a scalar multiplier.</span>
<span class="sd"> >>> from pyspark.mllib.linalg import Vectors</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])</span>
<span class="sd"> >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),</span>
<span class="sd"> ... inputCol="values", outputCol="eprod")</span>
<span class="sd"> >>> ep.transform(df).head().eprod</span>
<span class="sd"> DenseVector([2.0, 2.0, 9.0])</span>
<span class="sd"> >>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod</span>
<span class="sd"> DenseVector([4.0, 3.0, 15.0])</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">scalingVec</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"scalingVec"</span><span class="p">,</span> <span class="s">"vector for hadamard product, "</span> <span class="o">+</span>
<span class="s">"it must be MLlib Vector type."</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">scalingVec</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, scalingVec=None, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">ElementwiseProduct</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.ElementwiseProduct"</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">scalingVec</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"scalingVec"</span><span class="p">,</span> <span class="s">"vector for hadamard product, "</span> <span class="o">+</span>
<span class="s">"it must be MLlib Vector type."</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="ElementwiseProduct.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">scalingVec</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, scalingVec=None, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this ElementwiseProduct.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="ElementwiseProduct.setScalingVec"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct.setScalingVec">[docs]</a> <span class="k">def</span> <span class="nf">setScalingVec</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`scalingVec`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">scalingVec</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="ElementwiseProduct.getScalingVec"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct.getScalingVec">[docs]</a> <span class="k">def</span> <span class="nf">getScalingVec</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of scalingVec or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">scalingVec</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="HashingTF"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.HashingTF">[docs]</a><span class="k">class</span> <span class="nc">HashingTF</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">,</span> <span class="n">HasNumFeatures</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Maps a sequence of terms to their term frequencies using the</span>
<span class="sd"> hashing trick.</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(["a", "b", "c"],)], ["words"])</span>
<span class="sd"> >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", outputCol="features")</span>
<span class="sd"> >>> hashingTF.transform(df).head().features</span>
<span class="sd"> SparseVector(10, {7: 1.0, 8: 1.0, 9: 1.0})</span>
<span class="sd"> >>> hashingTF.setParams(outputCol="freqs").transform(df).head().freqs</span>
<span class="sd"> SparseVector(10, {7: 1.0, 8: 1.0, 9: 1.0})</span>
<span class="sd"> >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"}</span>
<span class="sd"> >>> hashingTF.transform(df, params).head().vector</span>
<span class="sd"> SparseVector(5, {2: 1.0, 3: 1.0, 4: 1.0})</span>
<span class="sd"> """</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">HashingTF</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.HashingTF"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="HashingTF.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.HashingTF.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">1</span> <span class="o"><<</span> <span class="mi">18</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, numFeatures=1 << 18, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this HashingTF.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="IDF"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.IDF">[docs]</a><span class="k">class</span> <span class="nc">IDF</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Compute the Inverse Document Frequency (IDF) given a collection of documents.</span>
<span class="sd"> >>> from pyspark.mllib.linalg import DenseVector</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(DenseVector([1.0, 2.0]),),</span>
<span class="sd"> ... (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])</span>
<span class="sd"> >>> idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")</span>
<span class="sd"> >>> idf.fit(df).transform(df).head().idf</span>
<span class="sd"> DenseVector([0.0, 0.0])</span>
<span class="sd"> >>> idf.setParams(outputCol="freqs").fit(df).transform(df).collect()[1].freqs</span>
<span class="sd"> DenseVector([0.0, 0.0])</span>
<span class="sd"> >>> params = {idf.minDocFreq: 1, idf.outputCol: "vector"}</span>
<span class="sd"> >>> idf.fit(df, params).transform(df).head().vector</span>
<span class="sd"> DenseVector([0.2877, 0.0])</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">minDocFreq</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"minDocFreq"</span><span class="p">,</span>
<span class="s">"minimum of documents in which a term should appear for filtering"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">minDocFreq</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, minDocFreq=0, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">IDF</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.IDF"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">minDocFreq</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"minDocFreq"</span><span class="p">,</span>
<span class="s">"minimum of documents in which a term should appear for filtering"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minDocFreq</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="IDF.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.IDF.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">minDocFreq</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, minDocFreq=0, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this IDF.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="IDF.setMinDocFreq"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.IDF.setMinDocFreq">[docs]</a> <span class="k">def</span> <span class="nf">setMinDocFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`minDocFreq`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">minDocFreq</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="IDF.getMinDocFreq"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.IDF.getMinDocFreq">[docs]</a> <span class="k">def</span> <span class="nf">getMinDocFreq</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of minDocFreq or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minDocFreq</span><span class="p">)</span>
</div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">IDFModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="IDFModel"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.IDFModel">[docs]</a><span class="k">class</span> <span class="nc">IDFModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Model fitted by IDF.</span>
<span class="sd"> """</span>
</div>
<span class="nd">@inherit_doc</span>
<span class="nd">@ignore_unicode_prefix</span>
<div class="viewcode-block" id="NGram"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.NGram">[docs]</a><span class="k">class</span> <span class="nc">NGram</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> A feature transformer that converts the input array of strings into an array of n-grams. Null</span>
<span class="sd"> values in the input array are ignored.</span>
<span class="sd"> It returns an array of n-grams where each n-gram is represented by a space-separated string of</span>
<span class="sd"> words.</span>
<span class="sd"> When the input is empty, an empty array is returned.</span>
<span class="sd"> When the input array length is less than n (number of elements per n-gram), no n-grams are</span>
<span class="sd"> returned.</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])</span>
<span class="sd"> >>> ngram = NGram(n=2, inputCol="inputTokens", outputCol="nGrams")</span>
<span class="sd"> >>> ngram.transform(df).head()</span>
<span class="sd"> Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b', u'b c', u'c d', u'd e'])</span>
<span class="sd"> >>> # Change n-gram length</span>
<span class="sd"> >>> ngram.setParams(n=4).transform(df).head()</span>
<span class="sd"> Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e'])</span>
<span class="sd"> >>> # Temporarily modify output column.</span>
<span class="sd"> >>> ngram.transform(df, {ngram.outputCol: "output"}).head()</span>
<span class="sd"> Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], output=[u'a b c d', u'b c d e'])</span>
<span class="sd"> >>> ngram.transform(df).head()</span>
<span class="sd"> Row(inputTokens=[u'a', u'b', u'c', u'd', u'e'], nGrams=[u'a b c d', u'b c d e'])</span>
<span class="sd"> >>> # Must use keyword arguments to specify params.</span>
<span class="sd"> >>> ngram.setParams("text")</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> TypeError: Method setParams forces keyword arguments.</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">n</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"n"</span><span class="p">,</span> <span class="s">"number of elements per n-gram (>=1)"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, n=2, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">NGram</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.NGram"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">n</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"n"</span><span class="p">,</span> <span class="s">"number of elements per n-gram (>=1)"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="NGram.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.NGram.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, n=2, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this NGram.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="NGram.setN"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.NGram.setN">[docs]</a> <span class="k">def</span> <span class="nf">setN</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`n`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="NGram.getN"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.NGram.getN">[docs]</a> <span class="k">def</span> <span class="nf">getN</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of n or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="Normalizer"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Normalizer">[docs]</a><span class="k">class</span> <span class="nc">Normalizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Normalize a vector to have unit norm using the given p-norm.</span>
<span class="sd"> >>> from pyspark.mllib.linalg import Vectors</span>
<span class="sd"> >>> svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(Vectors.dense([3.0, -4.0]), svec)], ["dense", "sparse"])</span>
<span class="sd"> >>> normalizer = Normalizer(p=2.0, inputCol="dense", outputCol="features")</span>
<span class="sd"> >>> normalizer.transform(df).head().features</span>
<span class="sd"> DenseVector([0.6, -0.8])</span>
<span class="sd"> >>> normalizer.setParams(inputCol="sparse", outputCol="freqs").transform(df).head().freqs</span>
<span class="sd"> SparseVector(4, {1: 0.8, 3: 0.6})</span>
<span class="sd"> >>> params = {normalizer.p: 1.0, normalizer.inputCol: "dense", normalizer.outputCol: "vector"}</span>
<span class="sd"> >>> normalizer.transform(df, params).head().vector</span>
<span class="sd"> DenseVector([0.4286, -0.5714])</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">p</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"p"</span><span class="p">,</span> <span class="s">"the p norm value."</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, p=2.0, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Normalizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.Normalizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">p</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"p"</span><span class="p">,</span> <span class="s">"the p norm value."</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="Normalizer.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Normalizer.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">p</span><span class="o">=</span><span class="mf">2.0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, p=2.0, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this Normalizer.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="Normalizer.setP"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Normalizer.setP">[docs]</a> <span class="k">def</span> <span class="nf">setP</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`p`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">p</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="Normalizer.getP"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Normalizer.getP">[docs]</a> <span class="k">def</span> <span class="nf">getP</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of p or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">p</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="OneHotEncoder"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.OneHotEncoder">[docs]</a><span class="k">class</span> <span class="nc">OneHotEncoder</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> A one-hot encoder that maps a column of category indices to a</span>
<span class="sd"> column of binary vectors, with at most a single one-value per row</span>
<span class="sd"> that indicates the input category index.</span>
<span class="sd"> For example with 5 categories, an input value of 2.0 would map to</span>
<span class="sd"> an output vector of `[0.0, 0.0, 1.0, 0.0]`.</span>
<span class="sd"> The last category is not included by default (configurable via</span>
<span class="sd"> :py:attr:`dropLast`) because it makes the vector entries sum up to</span>
<span class="sd"> one, and hence linearly dependent.</span>
<span class="sd"> So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.</span>
<span class="sd"> Note that this is different from scikit-learn's OneHotEncoder,</span>
<span class="sd"> which keeps all categories.</span>
<span class="sd"> The output vectors are sparse.</span>
<span class="sd"> .. seealso::</span>
<span class="sd"> :py:class:`StringIndexer` for converting categorical values into</span>
<span class="sd"> category indices</span>
<span class="sd"> >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")</span>
<span class="sd"> >>> model = stringIndexer.fit(stringIndDf)</span>
<span class="sd"> >>> td = model.transform(stringIndDf)</span>
<span class="sd"> >>> encoder = OneHotEncoder(inputCol="indexed", outputCol="features")</span>
<span class="sd"> >>> encoder.transform(td).head().features</span>
<span class="sd"> SparseVector(2, {0: 1.0})</span>
<span class="sd"> >>> encoder.setParams(outputCol="freqs").transform(td).head().freqs</span>
<span class="sd"> SparseVector(2, {0: 1.0})</span>
<span class="sd"> >>> params = {encoder.dropLast: False, encoder.outputCol: "test"}</span>
<span class="sd"> >>> encoder.transform(td, params).head().test</span>
<span class="sd"> SparseVector(3, {0: 1.0})</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">dropLast</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"dropLast"</span><span class="p">,</span> <span class="s">"whether to drop the last category"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropLast</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, dropLast=True, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">OneHotEncoder</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.OneHotEncoder"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">dropLast</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"dropLast"</span><span class="p">,</span> <span class="s">"whether to drop the last category"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">dropLast</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="OneHotEncoder.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.OneHotEncoder.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropLast</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, dropLast=True, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this OneHotEncoder.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="OneHotEncoder.setDropLast"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.OneHotEncoder.setDropLast">[docs]</a> <span class="k">def</span> <span class="nf">setDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`dropLast`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">dropLast</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="OneHotEncoder.getDropLast"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.OneHotEncoder.getDropLast">[docs]</a> <span class="k">def</span> <span class="nf">getDropLast</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of dropLast or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dropLast</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span><!-- viewcode block: PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol); generated Pygments markup mirroring pyspark/ml/feature.py, span class names must match pygments.css -->
<div class="viewcode-block" id="PolynomialExpansion"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion">[docs]</a><span class="k">class</span> <span class="nc">PolynomialExpansion</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion,</span>
<span class="sd"> which is available at `http://en.wikipedia.org/wiki/Polynomial_expansion`, "In mathematics, an</span>
<span class="sd"> expansion of a product of sums expresses it as a sum of products by using the fact that</span>
<span class="sd"> multiplication distributes over addition". Take a 2-variable feature vector as an example:</span>
<span class="sd"> `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.</span>
<span class="sd"> >>> from pyspark.mllib.linalg import Vectors</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["dense"])</span>
<span class="sd"> >>> px = PolynomialExpansion(degree=2, inputCol="dense", outputCol="expanded")</span>
<span class="sd"> >>> px.transform(df).head().expanded</span>
<span class="sd"> DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span>
<span class="sd"> >>> px.setParams(outputCol="test").transform(df).head().test</span>
<span class="sd"> DenseVector([0.5, 0.25, 2.0, 1.0, 4.0])</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">degree</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"degree"</span><span class="p">,</span> <span class="s">"the polynomial degree to expand (>= 1)"</span><span class="p">)</span><!-- class attribute is a doc placeholder (per the source comment above); __init__ rebinds it to an instance-owned Param -->
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, degree=2, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PolynomialExpansion</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span>
<span class="s">"org.apache.spark.ml.feature.PolynomialExpansion"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">degree</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"degree"</span><span class="p">,</span> <span class="s">"the polynomial degree to expand (>= 1)"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span><!-- _input_kwargs is presumably populated by the @keyword_only decorator defined elsewhere in pyspark, confirm against pyspark/ml/util -->
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="PolynomialExpansion.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">degree</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, degree=2, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this PolynomialExpansion.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="PolynomialExpansion.setDegree"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion.setDegree">[docs]</a> <span class="k">def</span> <span class="nf">setDegree</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`degree`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">degree</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span><!-- setter writes straight into self._paramMap rather than going through _set -->
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="PolynomialExpansion.getDegree"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion.getDegree">[docs]</a> <span class="k">def</span> <span class="nf">getDegree</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of degree or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">degree</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<span class="nd">@ignore_unicode_prefix</span><!-- viewcode block: RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol) with Params minTokenLength, gaps, pattern -->
<div class="viewcode-block" id="RegexTokenizer"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RegexTokenizer">[docs]</a><span class="k">class</span> <span class="nc">RegexTokenizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> A regex based tokenizer that extracts tokens either by using the</span>
<span class="sd"> provided regex pattern (in Java dialect) to split the text</span>
<span class="sd"> (default) or repeatedly matching the regex (if gaps is false).</span>
<span class="sd"> Optional parameters also allow filtering tokens using a minimal</span>
<span class="sd"> length.</span>
<span class="sd"> It returns an array of strings that can be empty.</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])</span>
<span class="sd"> >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")</span>
<span class="sd"> >>> reTokenizer.transform(df).head()</span>
<span class="sd"> Row(text=u'a b c', words=[u'a', u'b', u'c'])</span>
<span class="sd"> >>> # Change a parameter.</span>
<span class="sd"> >>> reTokenizer.setParams(outputCol="tokens").transform(df).head()</span>
<span class="sd"> Row(text=u'a b c', tokens=[u'a', u'b', u'c'])</span>
<span class="sd"> >>> # Temporarily modify a parameter.</span>
<span class="sd"> >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()</span>
<span class="sd"> Row(text=u'a b c', words=[u'a', u'b', u'c'])</span>
<span class="sd"> >>> reTokenizer.transform(df).head()</span>
<span class="sd"> Row(text=u'a b c', tokens=[u'a', u'b', u'c'])</span>
<span class="sd"> >>> # Must use keyword arguments to specify params.</span>
<span class="sd"> >>> reTokenizer.setParams("text")</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> TypeError: Method setParams forces keyword arguments.</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">minTokenLength</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"minTokenLength"</span><span class="p">,</span> <span class="s">"minimum token length (>= 0)"</span><span class="p">)</span><!-- the three class attributes below are doc placeholders, rebound to instance Params in __init__ -->
<span class="n">gaps</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"gaps"</span><span class="p">,</span> <span class="s">"whether regex splits on gaps (True) or matches tokens"</span><span class="p">)</span>
<span class="n">pattern</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"pattern"</span><span class="p">,</span> <span class="s">"regex pattern (Java dialect) used for tokenizing"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">minTokenLength</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">gaps</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s">"</span><span class="se">\\</span><span class="s">s+"</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">RegexTokenizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.RegexTokenizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">minTokenLength</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"minTokenLength"</span><span class="p">,</span> <span class="s">"minimum token length (>= 0)"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">gaps</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"gaps"</span><span class="p">,</span> <span class="s">"whether regex splits on gaps (True) or matches tokens"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">pattern</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"pattern"</span><span class="p">,</span> <span class="s">"regex pattern (Java dialect) used for tokenizing"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">minTokenLength</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">gaps</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s">"</span><span class="se">\\</span><span class="s">s+"</span><span class="p">)</span><!-- defaults: minTokenLength=1, gaps=True, pattern="\s+" -->
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="RegexTokenizer.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RegexTokenizer.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">minTokenLength</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">gaps</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s">"</span><span class="se">\\</span><span class="s">s+"</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this RegexTokenizer.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="RegexTokenizer.setMinTokenLength"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RegexTokenizer.setMinTokenLength">[docs]</a> <span class="k">def</span> <span class="nf">setMinTokenLength</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span><!-- the setter/getter pairs below write to self._paramMap and read via getOrDefault respectively -->
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`minTokenLength`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">minTokenLength</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="RegexTokenizer.getMinTokenLength"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RegexTokenizer.getMinTokenLength">[docs]</a> <span class="k">def</span> <span class="nf">getMinTokenLength</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of minTokenLength or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minTokenLength</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="RegexTokenizer.setGaps"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RegexTokenizer.setGaps">[docs]</a> <span class="k">def</span> <span class="nf">setGaps</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`gaps`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">gaps</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="RegexTokenizer.getGaps"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RegexTokenizer.getGaps">[docs]</a> <span class="k">def</span> <span class="nf">getGaps</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of gaps or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">gaps</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="RegexTokenizer.setPattern"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RegexTokenizer.setPattern">[docs]</a> <span class="k">def</span> <span class="nf">setPattern</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`pattern`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">pattern</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="RegexTokenizer.getPattern"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RegexTokenizer.getPattern">[docs]</a> <span class="k">def</span> <span class="nf">getPattern</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of pattern or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">pattern</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span><!-- viewcode block: StandardScaler(JavaEstimator, HasInputCol, HasOutputCol); fit() yields a StandardScalerModel via _create_model at the bottom of this block -->
<div class="viewcode-block" id="StandardScaler"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StandardScaler">[docs]</a><span class="k">class</span> <span class="nc">StandardScaler</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Standardizes features by removing the mean and scaling to unit variance using column summary</span>
<span class="sd"> statistics on the samples in the training set.</span>
<span class="sd"> >>> from pyspark.mllib.linalg import Vectors</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])</span>
<span class="sd"> >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")</span>
<span class="sd"> >>> model = standardScaler.fit(df)</span>
<span class="sd"> >>> model.mean</span>
<span class="sd"> DenseVector([1.0])</span>
<span class="sd"> >>> model.std</span>
<span class="sd"> DenseVector([1.4142])</span>
<span class="sd"> >>> model.transform(df).collect()[1].scaled</span>
<span class="sd"> DenseVector([1.4142])</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">withMean</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"withMean"</span><span class="p">,</span> <span class="s">"Center data with mean"</span><span class="p">)</span><!-- doc placeholders, rebound to instance Params in __init__ -->
<span class="n">withStd</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"withStd"</span><span class="p">,</span> <span class="s">"Scale to unit standard deviation"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">withMean</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">StandardScaler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.StandardScaler"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">withMean</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"withMean"</span><span class="p">,</span> <span class="s">"Center data with mean"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">withStd</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"withStd"</span><span class="p">,</span> <span class="s">"Scale to unit standard deviation"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">withMean</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="StandardScaler.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StandardScaler.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">withMean</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">withStd</span><span class="o">=</span><span class="bp">True</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, withMean=False, withStd=True, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this StandardScaler.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="StandardScaler.setWithMean"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StandardScaler.setWithMean">[docs]</a> <span class="k">def</span> <span class="nf">setWithMean</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`withMean`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">withMean</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="StandardScaler.getWithMean"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StandardScaler.getWithMean">[docs]</a> <span class="k">def</span> <span class="nf">getWithMean</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of withMean or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withMean</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="StandardScaler.setWithStd"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StandardScaler.setWithStd">[docs]</a> <span class="k">def</span> <span class="nf">setWithStd</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`withStd`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">withStd</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="StandardScaler.getWithStd"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StandardScaler.getWithStd">[docs]</a> <span class="k">def</span> <span class="nf">getWithStd</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of withStd or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">withStd</span><span class="p">)</span>
</div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span><!-- estimator hook: wraps the fitted Java model in the Python StandardScalerModel -->
<span class="k">return</span> <span class="n">StandardScalerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="StandardScalerModel"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StandardScalerModel">[docs]</a><span class="k">class</span> <span class="nc">StandardScalerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">):</span><!-- viewcode block: StandardScalerModel(JavaModel); read-only std and mean properties fetched from the JVM via _call_java -->
<span class="sd">"""</span>
<span class="sd"> Model fitted by StandardScaler.</span>
<span class="sd"> """</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Standard deviation of the StandardScalerModel.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s">"std"</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Mean of the StandardScalerModel.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s">"mean"</span><span class="p">)</span>
</div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="StringIndexer"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StringIndexer">[docs]</a><span class="k">class</span> <span class="nc">StringIndexer</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> A label indexer that maps a string column of labels to an ML column of label indices.</span>
<span class="sd"> If the input column is numeric, we cast it to string and index the string values.</span>
<span class="sd"> The indices are in [0, numLabels), ordered by label frequencies.</span>
<span class="sd"> So the most frequent label gets index 0.</span>
<span class="sd"> >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")</span>
<span class="sd"> >>> model = stringIndexer.fit(stringIndDf)</span>
<span class="sd"> >>> td = model.transform(stringIndDf)</span>
<span class="sd"> >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),</span>
<span class="sd"> ... key=lambda x: x[0])</span>
<span class="sd"> [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]</span>
<span class="sd"> """</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">StringIndexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.StringIndexer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="StringIndexer.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StringIndexer.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this StringIndexer.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">StringIndexerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="StringIndexerModel"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.StringIndexerModel">[docs]</a><span class="k">class</span> <span class="nc">StringIndexerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Model fitted by StringIndexer.</span>
<span class="sd"> """</span>
</div>
<span class="nd">@inherit_doc</span>
<span class="nd">@ignore_unicode_prefix</span>
<div class="viewcode-block" id="Tokenizer"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Tokenizer">[docs]</a><span class="k">class</span> <span class="nc">Tokenizer</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> A tokenizer that converts the input string to lowercase and then</span>
<span class="sd"> splits it by white spaces.</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])</span>
<span class="sd"> >>> tokenizer = Tokenizer(inputCol="text", outputCol="words")</span>
<span class="sd"> >>> tokenizer.transform(df).head()</span>
<span class="sd"> Row(text=u'a b c', words=[u'a', u'b', u'c'])</span>
<span class="sd"> >>> # Change a parameter.</span>
<span class="sd"> >>> tokenizer.setParams(outputCol="tokens").transform(df).head()</span>
<span class="sd"> Row(text=u'a b c', tokens=[u'a', u'b', u'c'])</span>
<span class="sd"> >>> # Temporarily modify a parameter.</span>
<span class="sd"> >>> tokenizer.transform(df, {tokenizer.outputCol: "words"}).head()</span>
<span class="sd"> Row(text=u'a b c', words=[u'a', u'b', u'c'])</span>
<span class="sd"> >>> tokenizer.transform(df).head()</span>
<span class="sd"> Row(text=u'a b c', tokens=[u'a', u'b', u'c'])</span>
<span class="sd"> >>> # Must use keyword arguments to specify params.</span>
<span class="sd"> >>> tokenizer.setParams("text")</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> TypeError: Method setParams forces keyword arguments.</span>
<span class="sd"> """</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Tokenizer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.Tokenizer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="Tokenizer.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Tokenizer.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd">        setParams(self, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this Tokenizer.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="VectorAssembler"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.VectorAssembler">[docs]</a><span class="k">class</span> <span class="nc">VectorAssembler</span><span class="p">(</span><span class="n">JavaTransformer</span><span class="p">,</span> <span class="n">HasInputCols</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> A feature transformer that merges multiple columns into a vector column.</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(1, 0, 3)], ["a", "b", "c"])</span>
<span class="sd"> >>> vecAssembler = VectorAssembler(inputCols=["a", "b", "c"], outputCol="features")</span>
<span class="sd"> >>> vecAssembler.transform(df).head().features</span>
<span class="sd"> DenseVector([1.0, 0.0, 3.0])</span>
<span class="sd"> >>> vecAssembler.setParams(outputCol="freqs").transform(df).head().freqs</span>
<span class="sd"> DenseVector([1.0, 0.0, 3.0])</span>
<span class="sd"> >>> params = {vecAssembler.inputCols: ["b", "a"], vecAssembler.outputCol: "vector"}</span>
<span class="sd"> >>> vecAssembler.transform(df, params).head().vector</span>
<span class="sd"> DenseVector([0.0, 1.0])</span>
<span class="sd"> """</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, inputCols=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorAssembler</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.VectorAssembler"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="VectorAssembler.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.VectorAssembler.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">inputCols</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, inputCols=None, outputCol=None)</span>
<span class="sd"> Sets params for this VectorAssembler.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="VectorIndexer"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.VectorIndexer">[docs]</a><span class="k">class</span> <span class="nc">VectorIndexer</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd">    Class for indexing categorical feature columns in a dataset of Vector.</span>
<span class="sd"> This has 2 usage modes:</span>
<span class="sd"> - Automatically identify categorical features (default behavior)</span>
<span class="sd"> - This helps process a dataset of unknown vectors into a dataset with some continuous</span>
<span class="sd"> features and some categorical features. The choice between continuous and categorical</span>
<span class="sd"> is based upon a maxCategories parameter.</span>
<span class="sd">       - Set maxCategories to the maximum number of categories any categorical feature should</span>
<span class="sd"> have.</span>
<span class="sd"> - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.</span>
<span class="sd"> If maxCategories = 2, then feature 0 will be declared categorical and use indices {0, 1},</span>
<span class="sd"> and feature 1 will be declared continuous.</span>
<span class="sd"> - Index all features, if all features are categorical</span>
<span class="sd"> - If maxCategories is set to be very large, then this will build an index of unique</span>
<span class="sd"> values for all features.</span>
<span class="sd"> - Warning: This can cause problems if features are continuous since this will collect ALL</span>
<span class="sd"> unique values to the driver.</span>
<span class="sd"> - E.g.: Feature 0 has unique values {-1.0, 0.0}, and feature 1 values {1.0, 3.0, 5.0}.</span>
<span class="sd"> If maxCategories >= 3, then both features will be declared categorical.</span>
<span class="sd"> This returns a model which can transform categorical features to use 0-based indices.</span>
<span class="sd"> Index stability:</span>
<span class="sd"> - This is not guaranteed to choose the same category index across multiple runs.</span>
<span class="sd"> - If a categorical feature includes value 0, then this is guaranteed to map value 0 to</span>
<span class="sd"> index 0. This maintains vector sparsity.</span>
<span class="sd"> - More stability may be added in the future.</span>
<span class="sd"> TODO: Future extensions: The following functionality is planned for the future:</span>
<span class="sd"> - Preserve metadata in transform; if a feature's metadata is already present,</span>
<span class="sd"> do not recompute.</span>
<span class="sd"> - Specify certain features to not index, either via a parameter or via existing metadata.</span>
<span class="sd"> - Add warning if a categorical feature has only 1 category.</span>
<span class="sd"> - Add option for allowing unknown categories.</span>
<span class="sd"> >>> from pyspark.mllib.linalg import Vectors</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([(Vectors.dense([-1.0, 0.0]),),</span>
<span class="sd"> ... (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"])</span>
<span class="sd"> >>> indexer = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed")</span>
<span class="sd"> >>> model = indexer.fit(df)</span>
<span class="sd"> >>> model.transform(df).head().indexed</span>
<span class="sd"> DenseVector([1.0, 0.0])</span>
<span class="sd"> >>> indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test</span>
<span class="sd"> DenseVector([0.0, 1.0])</span>
<span class="sd"> >>> params = {indexer.maxCategories: 3, indexer.outputCol: "vector"}</span>
<span class="sd"> >>> model2 = indexer.fit(df, params)</span>
<span class="sd"> >>> model2.transform(df).head().vector</span>
<span class="sd"> DenseVector([1.0, 0.0])</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">maxCategories</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"maxCategories"</span><span class="p">,</span>
<span class="s">"Threshold for the number of values a categorical feature can take "</span> <span class="o">+</span>
<span class="s">"(>= 2). If a feature is found to have > maxCategories values, then "</span> <span class="o">+</span>
<span class="s">"it is declared continuous."</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">maxCategories</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, maxCategories=20, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">VectorIndexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.VectorIndexer"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">maxCategories</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"maxCategories"</span><span class="p">,</span>
<span class="s">"Threshold for the number of values a categorical feature "</span> <span class="o">+</span>
<span class="s">"can take (>= 2). If a feature is found to have "</span> <span class="o">+</span>
<span class="s">"> maxCategories values, then it is declared continuous."</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">maxCategories</span><span class="o">=</span><span class="mi">20</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="VectorIndexer.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.VectorIndexer.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">maxCategories</span><span class="o">=</span><span class="mi">20</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, maxCategories=20, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this VectorIndexer.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="VectorIndexer.setMaxCategories"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.VectorIndexer.setMaxCategories">[docs]</a> <span class="k">def</span> <span class="nf">setMaxCategories</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`maxCategories`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">maxCategories</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="VectorIndexer.getMaxCategories"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.VectorIndexer.getMaxCategories">[docs]</a> <span class="k">def</span> <span class="nf">getMaxCategories</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of maxCategories or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">maxCategories</span><span class="p">)</span>
</div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">VectorIndexerModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
</div>
<span class="k">class</span> <span class="nc">VectorIndexerModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Model fitted by VectorIndexer.</span>
<span class="sd"> """</span>
<span class="nd">@inherit_doc</span>
<span class="nd">@ignore_unicode_prefix</span>
<div class="viewcode-block" id="Word2Vec"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2Vec">[docs]</a><span class="k">class</span> <span class="nc">Word2Vec</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">HasStepSize</span><span class="p">,</span> <span class="n">HasMaxIter</span><span class="p">,</span> <span class="n">HasSeed</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Word2Vec trains a model of `Map(String, Vector)`, i.e. transforms a word into a code for further</span>
<span class="sd"> natural language processing or machine learning process.</span>
<span class="sd"> >>> sent = ("a b " * 100 + "a c " * 10).split(" ")</span>
<span class="sd"> >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"])</span>
<span class="sd"> >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model").fit(doc)</span>
<span class="sd"> >>> model.getVectors().show()</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> |word| vector|</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> | a|[-0.3511952459812...|</span>
<span class="sd"> | b|[0.29077222943305...|</span>
<span class="sd"> | c|[0.02315592765808...|</span>
<span class="sd"> +----+--------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> >>> model.findSynonyms("a", 2).show()</span>
<span class="sd"> +----+-------------------+</span>
<span class="sd"> |word| similarity|</span>
<span class="sd"> +----+-------------------+</span>
<span class="sd"> | b|0.29255685145799626|</span>
<span class="sd"> | c|-0.5414068302988307|</span>
<span class="sd"> +----+-------------------+</span>
<span class="sd"> ...</span>
<span class="sd"> >>> model.transform(doc).head().model</span>
<span class="sd"> DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276])</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">vectorSize</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"vectorSize"</span><span class="p">,</span>
<span class="s">"the dimension of codes after transforming from words"</span><span class="p">)</span>
<span class="n">numPartitions</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"numPartitions"</span><span class="p">,</span>
<span class="s">"number of partitions for sentences of words"</span><span class="p">)</span>
<span class="n">minCount</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"minCount"</span><span class="p">,</span>
<span class="s">"the minimum number of times a token must appear to be included in the "</span> <span class="o">+</span>
<span class="s">"word2vec model's vocabulary"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">vectorSize</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">stepSize</span><span class="o">=</span><span class="mf">0.025</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">seed</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \</span>
<span class="sd"> seed=None, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Word2Vec</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.Word2Vec"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">vectorSize</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"vectorSize"</span><span class="p">,</span>
<span class="s">"the dimension of codes after transforming from words"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">numPartitions</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"numPartitions"</span><span class="p">,</span>
<span class="s">"number of partitions for sentences of words"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">minCount</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"minCount"</span><span class="p">,</span>
<span class="s">"the minimum number of times a token must appear to be included "</span> <span class="o">+</span>
<span class="s">"in the word2vec model's vocabulary"</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setDefault</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">stepSize</span><span class="o">=</span><span class="mf">0.025</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">seed</span><span class="o">=</span><span class="bp">None</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="Word2Vec.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2Vec.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">vectorSize</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">stepSize</span><span class="o">=</span><span class="mf">0.025</span><span class="p">,</span> <span class="n">maxIter</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
<span class="n">seed</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd">        setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \</span>
<span class="sd">                  seed=None, inputCol=None, outputCol=None)</span>
<span class="sd"> Sets params for this Word2Vec.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="Word2Vec.setVectorSize"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2Vec.setVectorSize">[docs]</a> <span class="k">def</span> <span class="nf">setVectorSize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`vectorSize`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">vectorSize</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="Word2Vec.getVectorSize"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2Vec.getVectorSize">[docs]</a> <span class="k">def</span> <span class="nf">getVectorSize</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of vectorSize or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vectorSize</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="Word2Vec.setNumPartitions"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2Vec.setNumPartitions">[docs]</a> <span class="k">def</span> <span class="nf">setNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`numPartitions`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">numPartitions</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="Word2Vec.getNumPartitions"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2Vec.getNumPartitions">[docs]</a> <span class="k">def</span> <span class="nf">getNumPartitions</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of numPartitions or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">numPartitions</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="Word2Vec.setMinCount"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2Vec.setMinCount">[docs]</a> <span class="k">def</span> <span class="nf">setMinCount</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`minCount`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">minCount</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="Word2Vec.getMinCount"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2Vec.getMinCount">[docs]</a> <span class="k">def</span> <span class="nf">getMinCount</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of minCount or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">minCount</span><span class="p">)</span>
</div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">Word2VecModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="Word2VecModel"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2VecModel">[docs]</a><span class="k">class</span> <span class="nc">Word2VecModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Model fitted by Word2Vec.</span>
<span class="sd"> """</span>
<div class="viewcode-block" id="Word2VecModel.getVectors"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2VecModel.getVectors">[docs]</a> <span class="k">def</span> <span class="nf">getVectors</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Returns the vector representation of the words as a dataframe</span>
<span class="sd"> with two fields, word and vector.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s">"getVectors"</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="Word2VecModel.findSynonyms"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.Word2VecModel.findSynonyms">[docs]</a> <span class="k">def</span> <span class="nf">findSynonyms</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Find "num" number of words closest in similarity to "word".</span>
<span class="sd"> word can be a string or vector representation.</span>
<span class="sd"> Returns a dataframe with two fields word and similarity (which</span>
<span class="sd"> gives the cosine similarity).</span>
<span class="sd"> """</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">word</span><span class="p">,</span> <span class="nb">basestring</span><span class="p">):</span>
<span class="n">word</span> <span class="o">=</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_call_java</span><span class="p">(</span><span class="s">"findSynonyms"</span><span class="p">,</span> <span class="n">word</span><span class="p">,</span> <span class="n">num</span><span class="p">)</span>
</div></div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="PCA"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PCA">[docs]</a><span class="k">class</span> <span class="nc">PCA</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">HasInputCol</span><span class="p">,</span> <span class="n">HasOutputCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> PCA trains a model to project vectors to a low-dimensional space using PCA.</span>
<span class="sd"> >>> from pyspark.mllib.linalg import Vectors</span>
<span class="sd"> >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),</span>
<span class="sd"> ... (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),</span>
<span class="sd"> ... (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]</span>
<span class="sd"> >>> df = sqlContext.createDataFrame(data,["features"])</span>
<span class="sd"> >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features")</span>
<span class="sd"> >>> model = pca.fit(df)</span>
<span class="sd"> >>> model.transform(df).collect()[0].pca_features</span>
<span class="sd"> DenseVector([1.648..., -4.013...])</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">k</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"k"</span><span class="p">,</span> <span class="s">"the number of principal components"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, k=None, inputCol=None, outputCol=None)</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">PCA</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.PCA"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">k</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"k"</span><span class="p">,</span> <span class="s">"the number of principal components"</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="PCA.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PCA.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, k=None, inputCol=None, outputCol=None)</span>
<span class="sd"> Set params for this PCA.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="PCA.setK"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PCA.setK">[docs]</a> <span class="k">def</span> <span class="nf">setK</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`k`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="PCA.getK"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PCA.getK">[docs]</a> <span class="k">def</span> <span class="nf">getK</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of k or its default value.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">k</span><span class="p">)</span>
</div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">PCAModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="PCAModel"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.PCAModel">[docs]</a><span class="k">class</span> <span class="nc">PCAModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Model fitted by PCA.</span>
<span class="sd"> """</span>
</div>
<span class="nd">@inherit_doc</span>
<div class="viewcode-block" id="RFormula"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RFormula">[docs]</a><span class="k">class</span> <span class="nc">RFormula</span><span class="p">(</span><span class="n">JavaEstimator</span><span class="p">,</span> <span class="n">HasFeaturesCol</span><span class="p">,</span> <span class="n">HasLabelCol</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> .. note:: Experimental</span>
<span class="sd"> Implements the transforms required for fitting a dataset against an</span>
<span class="sd"> R model formula. Currently we support a limited subset of the R</span>
<span class="sd"> operators, including '~', '+', '-', and '.'. Also see the R formula</span>
<span class="sd"> docs:</span>
<span class="sd"> http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html</span>
<span class="sd"> >>> df = sqlContext.createDataFrame([</span>
<span class="sd"> ... (1.0, 1.0, "a"),</span>
<span class="sd"> ... (0.0, 2.0, "b"),</span>
<span class="sd"> ... (0.0, 0.0, "a")</span>
<span class="sd"> ... ], ["y", "x", "s"])</span>
<span class="sd"> >>> rf = RFormula(formula="y ~ x + s")</span>
<span class="sd"> >>> rf.fit(df).transform(df).show()</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> | y| x| s| features|label|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> |1.0|1.0| a|[1.0,1.0]| 1.0|</span>
<span class="sd"> |0.0|2.0| b|[2.0,0.0]| 0.0|</span>
<span class="sd"> |0.0|0.0| a|[0.0,1.0]| 0.0|</span>
<span class="sd"> +---+---+---+---------+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> >>> rf.fit(df, {rf.formula: "y ~ . - s"}).transform(df).show()</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> | y| x| s|features|label|</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> |1.0|1.0| a| [1.0]| 1.0|</span>
<span class="sd"> |0.0|2.0| b| [2.0]| 0.0|</span>
<span class="sd"> |0.0|0.0| a| [0.0]| 0.0|</span>
<span class="sd"> +---+---+---+--------+-----+</span>
<span class="sd"> ...</span>
<span class="sd"> """</span>
<span class="c"># a placeholder to make it appear in the generated doc</span>
<span class="n">formula</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="n">Params</span><span class="o">.</span><span class="n">_dummy</span><span class="p">(),</span> <span class="s">"formula"</span><span class="p">,</span> <span class="s">"R model formula"</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">formula</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">labelCol</span><span class="o">=</span><span class="s">"label"</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> __init__(self, formula=None, featuresCol="features", labelCol="label")</span>
<span class="sd"> """</span>
<span class="nb">super</span><span class="p">(</span><span class="n">RFormula</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_java_obj</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_new_java_obj</span><span class="p">(</span><span class="s">"org.apache.spark.ml.feature.RFormula"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">uid</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">formula</span> <span class="o">=</span> <span class="n">Param</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s">"formula"</span><span class="p">,</span> <span class="s">"R model formula"</span><span class="p">)</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__init__</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="nd">@keyword_only</span>
<div class="viewcode-block" id="RFormula.setParams"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RFormula.setParams">[docs]</a> <span class="k">def</span> <span class="nf">setParams</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">formula</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">featuresCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">labelCol</span><span class="o">=</span><span class="s">"label"</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> setParams(self, formula=None, featuresCol="features", labelCol="label")</span>
<span class="sd"> Sets params for RFormula.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">setParams</span><span class="o">.</span><span class="n">_input_kwargs</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_set</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="RFormula.setFormula"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RFormula.setFormula">[docs]</a> <span class="k">def</span> <span class="nf">setFormula</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Sets the value of :py:attr:`formula`.</span>
<span class="sd"> """</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_paramMap</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">formula</span><span class="p">]</span> <span class="o">=</span> <span class="n">value</span>
<span class="k">return</span> <span class="bp">self</span>
</div>
<div class="viewcode-block" id="RFormula.getFormula"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RFormula.getFormula">[docs]</a> <span class="k">def</span> <span class="nf">getFormula</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Gets the value of :py:attr:`formula`.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">getOrDefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">formula</span><span class="p">)</span>
</div>
<span class="k">def</span> <span class="nf">_create_model</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">java_model</span><span class="p">):</span>
<span class="k">return</span> <span class="n">RFormulaModel</span><span class="p">(</span><span class="n">java_model</span><span class="p">)</span>
</div>
<div class="viewcode-block" id="RFormulaModel"><a class="viewcode-back" href="../../../pyspark.ml.html#pyspark.ml.feature.RFormulaModel">[docs]</a><span class="k">class</span> <span class="nc">RFormulaModel</span><span class="p">(</span><span class="n">JavaModel</span><span class="p">):</span>
<span class="sd">"""</span>
<span class="sd"> Model fitted by :py:class:`RFormula`.</span>
<span class="sd"> """</span>
</div>
<span class="k">if</span> <span class="n">__name__</span> <span class="o">==</span> <span class="s">"__main__"</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">from</span> <span class="nn">pyspark.context</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span><span class="p">,</span> <span class="n">SQLContext</span>
<span class="n">globs</span> <span class="o">=</span> <span class="nb">globals</span><span class="p">()</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="c"># The small batch size here ensures that we see multiple batches,</span>
<span class="c"># even in these small test examples:</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="p">(</span><span class="s">"local[2]"</span><span class="p">,</span> <span class="s">"ml.feature tests"</span><span class="p">)</span>
<span class="n">sqlContext</span> <span class="o">=</span> <span class="n">SQLContext</span><span class="p">(</span><span class="n">sc</span><span class="p">)</span>
<span class="n">globs</span><span class="p">[</span><span class="s">'sc'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span>
<span class="n">globs</span><span class="p">[</span><span class="s">'sqlContext'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sqlContext</span>
<span class="n">testData</span> <span class="o">=</span> <span class="n">sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">([</span><span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">"a"</span><span class="p">),</span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">"b"</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">"c"</span><span class="p">),</span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">"a"</span><span class="p">),</span>
<span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">"a"</span><span class="p">),</span> <span class="n">Row</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">label</span><span class="o">=</span><span class="s">"c"</span><span class="p">)],</span> <span class="mi">2</span><span class="p">)</span>
<span class="n">globs</span><span class="p">[</span><span class="s">'stringIndDf'</span><span class="p">]</span> <span class="o">=</span> <span class="n">sqlContext</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">testData</span><span class="p">)</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span>
<span class="n">sc</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="nb">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<p class="logo"><a href="../../../index.html">
<img class="logo" src="../../../_static/spark-logo-hd.png" alt="Logo"/>
</a></p>
<div id="searchbox" style="display: none" role="search">
<h3>Quick search</h3>
<form class="search" action="../../../search.html" method="get">
<input type="text" name="q" />
<input type="submit" value="Go" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
<p class="searchtip" style="font-size: 90%">
Enter search terms or a module, class or function name.
</p>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="related" role="navigation" aria-label="related navigation">
<h3>Navigation</h3>
<ul>
<li class="nav-item nav-item-0"><a href="../../../index.html">PySpark 1.5.0 documentation</a> »</li>
<li class="nav-item nav-item-1"><a href="../../index.html" >Module code</a> »</li>
</ul>
</div>
<div class="footer" role="contentinfo">
        &#169; Copyright.
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.3.1.
</div>
</body>
</html>