<?xml version="1.0" encoding="ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<!-- Repeats the encoding named in the XML declaration for user agents that
     parse this page as text/html and therefore ignore that declaration. -->
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii" />
<title>pyspark.mllib.util</title>
<!-- Stylesheet and script shared by the generated pages. NOTE(review):
     epydoc.js presumably defines the toggle() and doclink() functions called
     by the onclick handlers in the listing below - confirm. -->
<link rel="stylesheet" href="epydoc.css" type="text/css" />
<script type="text/javascript" src="epydoc.js"></script>
</head>
<body bgcolor="white" text="black" link="blue" vlink="#204080"
alink="#204080">
<!-- ==================== NAVIGATION BAR ==================== -->
<table class="navbar" border="0" width="100%" cellpadding="0"
bgcolor="#a0c0ff" cellspacing="0">
<tr valign="middle">
<!-- Home link -->
<th> <a href="pyspark-module.html">Home</a> </th>
<!-- Tree link -->
<th> <a href="module-tree.html">Trees</a> </th>
<!-- Index link -->
<th> <a href="identifier-index.html">Indices</a> </th>
<!-- Help link -->
<th> <a href="help.html">Help</a> </th>
<!-- Project homepage: linked over HTTPS so that leaving the docs does not
     begin with an unencrypted request; target="_top" replaces the whole
     frameset when the page is viewed inside frames.html -->
<th class="navbar" align="right" width="100%">
<table border="0" cellpadding="0" cellspacing="0">
<tr><th class="navbar" align="center"
><a class="navbar" target="_top" href="https://spark.apache.org">Spark 1.0.1 Python API Docs</a></th>
</tr></table></th>
</tr>
</table>
<table width="100%" cellpadding="0" cellspacing="0">
<tr valign="top">
<!-- Breadcrumb trail: enclosing packages are links, the current module is
     plain text -->
<td width="100%">
<span class="breadcrumbs">
<a href="pyspark-module.html">Package pyspark</a> ::
<a href="pyspark.mllib-module.html">Package mllib</a> ::
Module util
</span>
</td>
<td>
<table cellpadding="0" cellspacing="0">
<!-- frames / no frames switch; each choice is wrapped in its own pair of
     brackets. The tag-closing ">" is kept at the start of the next line so
     no whitespace is introduced between the brackets and the links. -->
<tr><td align="right"><span class="options"
>[<a href="frames.html" target="_top">frames</a
>] | [<a href="pyspark.mllib.util-pysrc.html"
target="_top">no frames</a>]</span></td></tr>
</table>
</td>
</tr>
</table>
<!-- Page heading; the link points at pyspark.mllib.util-module.html, the module's own page -->
<h1 class="epydoc">Source Code for <a href="pyspark.mllib.util-module.html">Module pyspark.mllib.util</a></h1>
<pre class="py-src">
<a name="L1"></a><tt class="py-lineno"> 1</tt> <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L2"></a><tt class="py-lineno"> 2</tt> <tt class="py-line"><tt class="py-comment"># Licensed to the Apache Software Foundation (ASF) under one or more</tt> </tt>
<a name="L3"></a><tt class="py-lineno"> 3</tt> <tt class="py-line"><tt class="py-comment"># contributor license agreements. See the NOTICE file distributed with</tt> </tt>
<a name="L4"></a><tt class="py-lineno"> 4</tt> <tt class="py-line"><tt class="py-comment"># this work for additional information regarding copyright ownership.</tt> </tt>
<a name="L5"></a><tt class="py-lineno"> 5</tt> <tt class="py-line"><tt class="py-comment"># The ASF licenses this file to You under the Apache License, Version 2.0</tt> </tt>
<a name="L6"></a><tt class="py-lineno"> 6</tt> <tt class="py-line"><tt class="py-comment"># (the "License"); you may not use this file except in compliance with</tt> </tt>
<a name="L7"></a><tt class="py-lineno"> 7</tt> <tt class="py-line"><tt class="py-comment"># the License. You may obtain a copy of the License at</tt> </tt>
<a name="L8"></a><tt class="py-lineno"> 8</tt> <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L9"></a><tt class="py-lineno"> 9</tt> <tt class="py-line"><tt class="py-comment"># http://www.apache.org/licenses/LICENSE-2.0</tt> </tt>
<a name="L10"></a><tt class="py-lineno"> 10</tt> <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L11"></a><tt class="py-lineno"> 11</tt> <tt class="py-line"><tt class="py-comment"># Unless required by applicable law or agreed to in writing, software</tt> </tt>
<a name="L12"></a><tt class="py-lineno"> 12</tt> <tt class="py-line"><tt class="py-comment"># distributed under the License is distributed on an "AS IS" BASIS,</tt> </tt>
<a name="L13"></a><tt class="py-lineno"> 13</tt> <tt class="py-line"><tt class="py-comment"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</tt> </tt>
<a name="L14"></a><tt class="py-lineno"> 14</tt> <tt class="py-line"><tt class="py-comment"># See the License for the specific language governing permissions and</tt> </tt>
<a name="L15"></a><tt class="py-lineno"> 15</tt> <tt class="py-line"><tt class="py-comment"># limitations under the License.</tt> </tt>
<a name="L16"></a><tt class="py-lineno"> 16</tt> <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L17"></a><tt class="py-lineno"> 17</tt> <tt class="py-line"> </tt>
<a name="L18"></a><tt class="py-lineno"> 18</tt> <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">numpy</tt> <tt class="py-keyword">as</tt> <tt class="py-name">np</tt> </tt>
<a name="L19"></a><tt class="py-lineno"> 19</tt> <tt class="py-line"> </tt>
<a name="L20"></a><tt class="py-lineno"> 20</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-0" class="py-name" targets="Package pyspark=pyspark-module.html"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-0', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-1" class="py-name" targets="Package pyspark.mllib=pyspark.mllib-module.html"><a title="pyspark.mllib" class="py-name" href="#" onclick="return doclink('link-1', 'mllib', 'link-1');">mllib</a></tt><tt class="py-op">.</tt><tt id="link-2" class="py-name" targets="Module pyspark.mllib.linalg=pyspark.mllib.linalg-module.html"><a title="pyspark.mllib.linalg" class="py-name" href="#" onclick="return doclink('link-2', 'linalg', 'link-2');">linalg</a></tt> <tt class="py-keyword">import</tt> <tt id="link-3" class="py-name" targets="Class pyspark.mllib.linalg.Vectors=pyspark.mllib.linalg.Vectors-class.html"><a title="pyspark.mllib.linalg.Vectors" class="py-name" href="#" onclick="return doclink('link-3', 'Vectors', 'link-3');">Vectors</a></tt><tt class="py-op">,</tt> <tt id="link-4" class="py-name" targets="Class pyspark.mllib.linalg.SparseVector=pyspark.mllib.linalg.SparseVector-class.html"><a title="pyspark.mllib.linalg.SparseVector" class="py-name" href="#" onclick="return doclink('link-4', 'SparseVector', 'link-4');">SparseVector</a></tt> </tt>
<a name="L21"></a><tt class="py-lineno"> 21</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-5" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-5', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-6" class="py-name"><a title="pyspark.mllib" class="py-name" href="#" onclick="return doclink('link-6', 'mllib', 'link-1');">mllib</a></tt><tt class="py-op">.</tt><tt id="link-7" class="py-name" targets="Module pyspark.mllib.regression=pyspark.mllib.regression-module.html"><a title="pyspark.mllib.regression" class="py-name" href="#" onclick="return doclink('link-7', 'regression', 'link-7');">regression</a></tt> <tt class="py-keyword">import</tt> <tt id="link-8" class="py-name" targets="Class pyspark.mllib.regression.LabeledPoint=pyspark.mllib.regression.LabeledPoint-class.html"><a title="pyspark.mllib.regression.LabeledPoint" class="py-name" href="#" onclick="return doclink('link-8', 'LabeledPoint', 'link-8');">LabeledPoint</a></tt> </tt>
<a name="L22"></a><tt class="py-lineno"> 22</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-9" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-9', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-10" class="py-name"><a title="pyspark.mllib" class="py-name" href="#" onclick="return doclink('link-10', 'mllib', 'link-1');">mllib</a></tt><tt class="py-op">.</tt><tt class="py-name">_common</tt> <tt class="py-keyword">import</tt> <tt class="py-name">_convert_vector</tt> </tt>
<a name="MLUtils"></a><div id="MLUtils-def"><a name="L23"></a><tt class="py-lineno"> 23</tt> <tt class="py-line"> </tt>
<a name="L24"></a><tt class="py-lineno"> 24</tt> <tt class="py-line"> </tt>
<a name="L25"></a><tt class="py-lineno"> 25</tt> <a class="py-toggle" href="#" id="MLUtils-toggle" onclick="return toggle('MLUtils');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.mllib.util.MLUtils-class.html">MLUtils</a><tt class="py-op">:</tt> </tt>
</div><div id="MLUtils-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="MLUtils-expanded"><a name="L26"></a><tt class="py-lineno"> 26</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
<a name="L27"></a><tt class="py-lineno"> 27</tt> <tt class="py-line"><tt class="py-docstring"> Helper methods to load, save and pre-process data used in MLlib.</tt> </tt>
<a name="L28"></a><tt class="py-lineno"> 28</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
<a name="L29"></a><tt class="py-lineno"> 29</tt> <tt class="py-line"> </tt>
<a name="L30"></a><tt class="py-lineno"> 30</tt> <tt class="py-line"> <tt class="py-decorator">@</tt><tt class="py-decorator">staticmethod</tt> </tt>
<a name="MLUtils._parse_libsvm_line"></a><div id="MLUtils._parse_libsvm_line-def"><a name="L31"></a><tt class="py-lineno"> 31</tt> <a class="py-toggle" href="#" id="MLUtils._parse_libsvm_line-toggle" onclick="return toggle('MLUtils._parse_libsvm_line');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.mllib.util.MLUtils-class.html#_parse_libsvm_line">_parse_libsvm_line</a><tt class="py-op">(</tt><tt class="py-param">line</tt><tt class="py-op">,</tt> <tt class="py-param">multiclass</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="MLUtils._parse_libsvm_line-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="MLUtils._parse_libsvm_line-expanded"><a name="L32"></a><tt class="py-lineno"> 32</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
<a name="L33"></a><tt class="py-lineno"> 33</tt> <tt class="py-line"><tt class="py-docstring"> Parses a line in LIBSVM format into (label, indices, values).</tt> </tt>
<a name="L34"></a><tt class="py-lineno"> 34</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
<a name="L35"></a><tt class="py-lineno"> 35</tt> <tt class="py-line"> <tt class="py-name">items</tt> <tt class="py-op">=</tt> <tt class="py-name">line</tt><tt class="py-op">.</tt><tt class="py-name">split</tt><tt class="py-op">(</tt><tt class="py-name">None</tt><tt class="py-op">)</tt> </tt>
<a name="L36"></a><tt class="py-lineno"> 36</tt> <tt class="py-line"> <tt class="py-name">label</tt> <tt class="py-op">=</tt> <tt class="py-name">float</tt><tt class="py-op">(</tt><tt class="py-name">items</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
<a name="L37"></a><tt class="py-lineno"> 37</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">multiclass</tt><tt class="py-op">:</tt> </tt>
<a name="L38"></a><tt class="py-lineno"> 38</tt> <tt class="py-line"> <tt class="py-name">label</tt> <tt class="py-op">=</tt> <tt class="py-number">1.0</tt> <tt class="py-keyword">if</tt> <tt class="py-name">label</tt> <tt class="py-op">></tt> <tt class="py-number">0.5</tt> <tt class="py-keyword">else</tt> <tt class="py-number">0.0</tt> </tt>
<a name="L39"></a><tt class="py-lineno"> 39</tt> <tt class="py-line"> <tt class="py-name">nnz</tt> <tt class="py-op">=</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">items</tt><tt class="py-op">)</tt> <tt class="py-op">-</tt> <tt class="py-number">1</tt> </tt>
<a name="L40"></a><tt class="py-lineno"> 40</tt> <tt class="py-line"> <tt class="py-name">indices</tt> <tt class="py-op">=</tt> <tt class="py-name">np</tt><tt class="py-op">.</tt><tt class="py-name">zeros</tt><tt class="py-op">(</tt><tt class="py-name">nnz</tt><tt class="py-op">,</tt> <tt class="py-name">dtype</tt><tt class="py-op">=</tt><tt class="py-name">np</tt><tt class="py-op">.</tt><tt class="py-name">int32</tt><tt class="py-op">)</tt> </tt>
<a name="L41"></a><tt class="py-lineno"> 41</tt> <tt class="py-line"> <tt id="link-11" class="py-name" targets="Method pyspark.rdd.RDD.values()=pyspark.rdd.RDD-class.html#values"><a title="pyspark.rdd.RDD.values" class="py-name" href="#" onclick="return doclink('link-11', 'values', 'link-11');">values</a></tt> <tt class="py-op">=</tt> <tt class="py-name">np</tt><tt class="py-op">.</tt><tt class="py-name">zeros</tt><tt class="py-op">(</tt><tt class="py-name">nnz</tt><tt class="py-op">)</tt> </tt>
<a name="L42"></a><tt class="py-lineno"> 42</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">i</tt> <tt class="py-keyword">in</tt> <tt class="py-name">xrange</tt><tt class="py-op">(</tt><tt class="py-name">nnz</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L43"></a><tt class="py-lineno"> 43</tt> <tt class="py-line"> <tt class="py-name">index</tt><tt class="py-op">,</tt> <tt id="link-12" class="py-name" targets="Method pyspark.accumulators.Accumulator.value()=pyspark.accumulators.Accumulator-class.html#value"><a title="pyspark.accumulators.Accumulator.value" class="py-name" href="#" onclick="return doclink('link-12', 'value', 'link-12');">value</a></tt> <tt class="py-op">=</tt> <tt class="py-name">items</tt><tt class="py-op">[</tt><tt class="py-number">1</tt> <tt class="py-op">+</tt> <tt class="py-name">i</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">split</tt><tt class="py-op">(</tt><tt class="py-string">":"</tt><tt class="py-op">)</tt> </tt>
<a name="L44"></a><tt class="py-lineno"> 44</tt> <tt class="py-line"> <tt class="py-name">indices</tt><tt class="py-op">[</tt><tt class="py-name">i</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">int</tt><tt class="py-op">(</tt><tt class="py-name">index</tt><tt class="py-op">)</tt> <tt class="py-op">-</tt> <tt class="py-number">1</tt> </tt>
<a name="L45"></a><tt class="py-lineno"> 45</tt> <tt class="py-line"> <tt id="link-13" class="py-name"><a title="pyspark.rdd.RDD.values" class="py-name" href="#" onclick="return doclink('link-13', 'values', 'link-11');">values</a></tt><tt class="py-op">[</tt><tt class="py-name">i</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">float</tt><tt class="py-op">(</tt><tt id="link-14" class="py-name"><a title="pyspark.accumulators.Accumulator.value" class="py-name" href="#" onclick="return doclink('link-14', 'value', 'link-12');">value</a></tt><tt class="py-op">)</tt> </tt>
<a name="L46"></a><tt class="py-lineno"> 46</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">label</tt><tt class="py-op">,</tt> <tt class="py-name">indices</tt><tt class="py-op">,</tt> <tt id="link-15" class="py-name"><a title="pyspark.rdd.RDD.values" class="py-name" href="#" onclick="return doclink('link-15', 'values', 'link-11');">values</a></tt> </tt>
</div><a name="L47"></a><tt class="py-lineno"> 47</tt> <tt class="py-line"> </tt>
<a name="L48"></a><tt class="py-lineno"> 48</tt> <tt class="py-line"> <tt class="py-decorator">@</tt><tt class="py-decorator">staticmethod</tt> </tt>
<a name="MLUtils._convert_labeled_point_to_libsvm"></a><div id="MLUtils._convert_labeled_point_to_libsvm-def"><a name="L49"></a><tt class="py-lineno"> 49</tt> <a class="py-toggle" href="#" id="MLUtils._convert_labeled_point_to_libsvm-toggle" onclick="return toggle('MLUtils._convert_labeled_point_to_libsvm');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.mllib.util.MLUtils-class.html#_convert_labeled_point_to_libsvm">_convert_labeled_point_to_libsvm</a><tt class="py-op">(</tt><tt class="py-param">p</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="MLUtils._convert_labeled_point_to_libsvm-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="MLUtils._convert_labeled_point_to_libsvm-expanded"><a name="L50"></a><tt class="py-lineno"> 50</tt> <tt class="py-line"> <tt class="py-docstring">"""Converts a LabeledPoint to a string in LIBSVM format."""</tt> </tt>
<a name="L51"></a><tt class="py-lineno"> 51</tt> <tt class="py-line"> <tt class="py-name">items</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">p</tt><tt class="py-op">.</tt><tt class="py-name">label</tt><tt class="py-op">)</tt><tt class="py-op">]</tt> </tt>
<a name="L52"></a><tt class="py-lineno"> 52</tt> <tt class="py-line"> <tt class="py-name">v</tt> <tt class="py-op">=</tt> <tt class="py-name">_convert_vector</tt><tt class="py-op">(</tt><tt class="py-name">p</tt><tt class="py-op">.</tt><tt class="py-name">features</tt><tt class="py-op">)</tt> </tt>
<a name="L53"></a><tt class="py-lineno"> 53</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">type</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-name">np</tt><tt class="py-op">.</tt><tt class="py-name">ndarray</tt><tt class="py-op">:</tt> </tt>
<a name="L54"></a><tt class="py-lineno"> 54</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">i</tt> <tt class="py-keyword">in</tt> <tt class="py-name">xrange</tt><tt class="py-op">(</tt><tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L55"></a><tt class="py-lineno"> 55</tt> <tt class="py-line"> <tt class="py-name">items</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">i</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt> <tt class="py-op">+</tt> <tt class="py-string">":"</tt> <tt class="py-op">+</tt> <tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">[</tt><tt class="py-name">i</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L56"></a><tt class="py-lineno"> 56</tt> <tt class="py-line"> <tt class="py-keyword">elif</tt> <tt class="py-name">type</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt id="link-16" class="py-name"><a title="pyspark.mllib.linalg.SparseVector" class="py-name" href="#" onclick="return doclink('link-16', 'SparseVector', 'link-4');">SparseVector</a></tt><tt class="py-op">:</tt> </tt>
<a name="L57"></a><tt class="py-lineno"> 57</tt> <tt class="py-line"> <tt class="py-name">nnz</tt> <tt class="py-op">=</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">.</tt><tt class="py-name">indices</tt><tt class="py-op">)</tt> </tt>
<a name="L58"></a><tt class="py-lineno"> 58</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">i</tt> <tt class="py-keyword">in</tt> <tt class="py-name">xrange</tt><tt class="py-op">(</tt><tt class="py-name">nnz</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L59"></a><tt class="py-lineno"> 59</tt> <tt class="py-line"> <tt class="py-name">items</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">.</tt><tt class="py-name">indices</tt><tt class="py-op">[</tt><tt class="py-name">i</tt><tt class="py-op">]</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt> <tt class="py-op">+</tt> <tt class="py-string">":"</tt> <tt class="py-op">+</tt> <tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">.</tt><tt id="link-17" class="py-name"><a title="pyspark.rdd.RDD.values" class="py-name" href="#" onclick="return doclink('link-17', 'values', 'link-11');">values</a></tt><tt class="py-op">[</tt><tt class="py-name">i</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L60"></a><tt class="py-lineno"> 60</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L61"></a><tt class="py-lineno"> 61</tt> <tt class="py-line"> <tt class="py-keyword">raise</tt> <tt class="py-name">TypeError</tt><tt class="py-op">(</tt><tt class="py-string">"_convert_labeled_point_to_libsvm needs either ndarray or SparseVector"</tt> </tt>
<a name="L62"></a><tt class="py-lineno"> 62</tt> <tt class="py-line"> <tt class="py-string">" but got "</tt> <tt class="py-op">%</tt> <tt class="py-name">type</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L63"></a><tt class="py-lineno"> 63</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-string">" "</tt><tt class="py-op">.</tt><tt id="link-18" class="py-name" targets="Method pyspark.rdd.RDD.join()=pyspark.rdd.RDD-class.html#join"><a title="pyspark.rdd.RDD.join" class="py-name" href="#" onclick="return doclink('link-18', 'join', 'link-18');">join</a></tt><tt class="py-op">(</tt><tt class="py-name">items</tt><tt class="py-op">)</tt> </tt>
</div><a name="L64"></a><tt class="py-lineno"> 64</tt> <tt class="py-line"> </tt>
<a name="L65"></a><tt class="py-lineno"> 65</tt> <tt class="py-line"> <tt class="py-decorator">@</tt><tt class="py-decorator">staticmethod</tt> </tt>
<a name="MLUtils.loadLibSVMFile"></a><div id="MLUtils.loadLibSVMFile-def"><a name="L66"></a><tt class="py-lineno"> 66</tt> <a class="py-toggle" href="#" id="MLUtils.loadLibSVMFile-toggle" onclick="return toggle('MLUtils.loadLibSVMFile');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.mllib.util.MLUtils-class.html#loadLibSVMFile">loadLibSVMFile</a><tt class="py-op">(</tt><tt class="py-param">sc</tt><tt class="py-op">,</tt> <tt class="py-param">path</tt><tt class="py-op">,</tt> <tt class="py-param">multiclass</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">,</tt> <tt class="py-param">numFeatures</tt><tt class="py-op">=</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">,</tt> <tt class="py-param">minPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="MLUtils.loadLibSVMFile-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="MLUtils.loadLibSVMFile-expanded"><a name="L67"></a><tt class="py-lineno"> 67</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
<a name="L68"></a><tt class="py-lineno"> 68</tt> <tt class="py-line"><tt class="py-docstring"> Loads labeled data in the LIBSVM format into an RDD of</tt> </tt>
<a name="L69"></a><tt class="py-lineno"> 69</tt> <tt class="py-line"><tt class="py-docstring"> LabeledPoint. The LIBSVM format is a text-based format used by</tt> </tt>
<a name="L70"></a><tt class="py-lineno"> 70</tt> <tt class="py-line"><tt class="py-docstring"> LIBSVM and LIBLINEAR. Each line represents a labeled sparse</tt> </tt>
<a name="L71"></a><tt class="py-lineno"> 71</tt> <tt class="py-line"><tt class="py-docstring"> feature vector using the following format:</tt> </tt>
<a name="L72"></a><tt class="py-lineno"> 72</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L73"></a><tt class="py-lineno"> 73</tt> <tt class="py-line"><tt class="py-docstring"> label index1:value1 index2:value2 ...</tt> </tt>
<a name="L74"></a><tt class="py-lineno"> 74</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L75"></a><tt class="py-lineno"> 75</tt> <tt class="py-line"><tt class="py-docstring"> where the indices are one-based and in ascending order. This</tt> </tt>
<a name="L76"></a><tt class="py-lineno"> 76</tt> <tt class="py-line"><tt class="py-docstring"> method parses each line into a LabeledPoint, where the feature</tt> </tt>
<a name="L77"></a><tt class="py-lineno"> 77</tt> <tt class="py-line"><tt class="py-docstring"> indices are converted to zero-based.</tt> </tt>
<a name="L78"></a><tt class="py-lineno"> 78</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L79"></a><tt class="py-lineno"> 79</tt> <tt class="py-line"><tt class="py-docstring"> @param sc: Spark context</tt> </tt>
<a name="L80"></a><tt class="py-lineno"> 80</tt> <tt class="py-line"><tt class="py-docstring"> @param path: file or directory path in any Hadoop-supported file</tt> </tt>
<a name="L81"></a><tt class="py-lineno"> 81</tt> <tt class="py-line"><tt class="py-docstring"> system URI</tt> </tt>
<a name="L82"></a><tt class="py-lineno"> 82</tt> <tt class="py-line"><tt class="py-docstring"> @param multiclass: whether the input labels contain more than</tt> </tt>
<a name="L83"></a><tt class="py-lineno"> 83</tt> <tt class="py-line"><tt class="py-docstring"> two classes. If false, any label with value</tt> </tt>
<a name="L84"></a><tt class="py-lineno"> 84</tt> <tt class="py-line"><tt class="py-docstring"> greater than 0.5 will be mapped to 1.0, or</tt> </tt>
<a name="L85"></a><tt class="py-lineno"> 85</tt> <tt class="py-line"><tt class="py-docstring"> 0.0 otherwise. So it works for both +1/-1 and</tt> </tt>
<a name="L86"></a><tt class="py-lineno"> 86</tt> <tt class="py-line"><tt class="py-docstring"> 1/0 cases. If true, the double value parsed</tt> </tt>
<a name="L87"></a><tt class="py-lineno"> 87</tt> <tt class="py-line"><tt class="py-docstring"> directly from the label string will be used</tt> </tt>
<a name="L88"></a><tt class="py-lineno"> 88</tt> <tt class="py-line"><tt class="py-docstring"> as the label value.</tt> </tt>
<a name="L89"></a><tt class="py-lineno"> 89</tt> <tt class="py-line"><tt class="py-docstring"> @param numFeatures: number of features, which will be determined</tt> </tt>
<a name="L90"></a><tt class="py-lineno"> 90</tt> <tt class="py-line"><tt class="py-docstring"> from the input data if a nonpositive value</tt> </tt>
<a name="L91"></a><tt class="py-lineno"> 91</tt> <tt class="py-line"><tt class="py-docstring"> is given. This is useful when the dataset is</tt> </tt>
<a name="L92"></a><tt class="py-lineno"> 92</tt> <tt class="py-line"><tt class="py-docstring"> already split into multiple files and you</tt> </tt>
<a name="L93"></a><tt class="py-lineno"> 93</tt> <tt class="py-line"><tt class="py-docstring"> want to load them separately, because some</tt> </tt>
<a name="L94"></a><tt class="py-lineno"> 94</tt> <tt class="py-line"><tt class="py-docstring"> features may not present in certain files,</tt> </tt>
<a name="L95"></a><tt class="py-lineno"> 95</tt> <tt class="py-line"><tt class="py-docstring"> which leads to inconsistent feature</tt> </tt>
<a name="L96"></a><tt class="py-lineno"> 96</tt> <tt class="py-line"><tt class="py-docstring"> dimensions.</tt> </tt>
<a name="L97"></a><tt class="py-lineno"> 97</tt> <tt class="py-line"><tt class="py-docstring"> @param minPartitions: min number of partitions</tt> </tt>
<a name="L98"></a><tt class="py-lineno"> 98</tt> <tt class="py-line"><tt class="py-docstring"> @return: labeled data stored as an RDD of LabeledPoint</tt> </tt>
<a name="L99"></a><tt class="py-lineno"> 99</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L100"></a><tt class="py-lineno">100</tt> <tt class="py-line"><tt class="py-docstring"> >>> from tempfile import NamedTemporaryFile</tt> </tt>
<a name="L101"></a><tt class="py-lineno">101</tt> <tt class="py-line"><tt class="py-docstring"> >>> from pyspark.mllib.util import MLUtils</tt> </tt>
<a name="L102"></a><tt class="py-lineno">102</tt> <tt class="py-line"><tt class="py-docstring"> >>> tempFile = NamedTemporaryFile(delete=True)</tt> </tt>
<a name="L103"></a><tt class="py-lineno">103</tt> <tt class="py-line"><tt class="py-docstring"> >>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")</tt> </tt>
<a name="L104"></a><tt class="py-lineno">104</tt> <tt class="py-line"><tt class="py-docstring"> >>> tempFile.flush()</tt> </tt>
<a name="L105"></a><tt class="py-lineno">105</tt> <tt class="py-line"><tt class="py-docstring"> >>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()</tt> </tt>
<a name="L106"></a><tt class="py-lineno">106</tt> <tt class="py-line"><tt class="py-docstring"> >>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name, True).collect()</tt> </tt>
<a name="L107"></a><tt class="py-lineno">107</tt> <tt class="py-line"><tt class="py-docstring"> >>> tempFile.close()</tt> </tt>
<a name="L108"></a><tt class="py-lineno">108</tt> <tt class="py-line"><tt class="py-docstring"> >>> examples[0].label</tt> </tt>
<a name="L109"></a><tt class="py-lineno">109</tt> <tt class="py-line"><tt class="py-docstring"> 1.0</tt> </tt>
<a name="L110"></a><tt class="py-lineno">110</tt> <tt class="py-line"><tt class="py-docstring"> >>> examples[0].features.size</tt> </tt>
<a name="L111"></a><tt class="py-lineno">111</tt> <tt class="py-line"><tt class="py-docstring"> 6</tt> </tt>
<a name="L112"></a><tt class="py-lineno">112</tt> <tt class="py-line"><tt class="py-docstring"> >>> print examples[0].features</tt> </tt>
<a name="L113"></a><tt class="py-lineno">113</tt> <tt class="py-line"><tt class="py-docstring"> [0: 1.0, 2: 2.0, 4: 3.0]</tt> </tt>
<a name="L114"></a><tt class="py-lineno">114</tt> <tt class="py-line"><tt class="py-docstring"> >>> examples[1].label</tt> </tt>
<a name="L115"></a><tt class="py-lineno">115</tt> <tt class="py-line"><tt class="py-docstring"> 0.0</tt> </tt>
<a name="L116"></a><tt class="py-lineno">116</tt> <tt class="py-line"><tt class="py-docstring"> >>> examples[1].features.size</tt> </tt>
<a name="L117"></a><tt class="py-lineno">117</tt> <tt class="py-line"><tt class="py-docstring"> 6</tt> </tt>
<a name="L118"></a><tt class="py-lineno">118</tt> <tt class="py-line"><tt class="py-docstring"> >>> print examples[1].features</tt> </tt>
<a name="L119"></a><tt class="py-lineno">119</tt> <tt class="py-line"><tt class="py-docstring"> []</tt> </tt>
<a name="L120"></a><tt class="py-lineno">120</tt> <tt class="py-line"><tt class="py-docstring"> >>> examples[2].label</tt> </tt>
<a name="L121"></a><tt class="py-lineno">121</tt> <tt class="py-line"><tt class="py-docstring"> 0.0</tt> </tt>
<a name="L122"></a><tt class="py-lineno">122</tt> <tt class="py-line"><tt class="py-docstring"> >>> examples[2].features.size</tt> </tt>
<a name="L123"></a><tt class="py-lineno">123</tt> <tt class="py-line"><tt class="py-docstring"> 6</tt> </tt>
<a name="L124"></a><tt class="py-lineno">124</tt> <tt class="py-line"><tt class="py-docstring"> >>> print examples[2].features</tt> </tt>
<a name="L125"></a><tt class="py-lineno">125</tt> <tt class="py-line"><tt class="py-docstring"> [1: 4.0, 3: 5.0, 5: 6.0]</tt> </tt>
<a name="L126"></a><tt class="py-lineno">126</tt> <tt class="py-line"><tt class="py-docstring"> >>> multiclass_examples[1].label</tt> </tt>
<a name="L127"></a><tt class="py-lineno">127</tt> <tt class="py-line"><tt class="py-docstring"> -1.0</tt> </tt>
<a name="L128"></a><tt class="py-lineno">128</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
<a name="L129"></a><tt class="py-lineno">129</tt> <tt class="py-line"> </tt>
<a name="L130"></a><tt class="py-lineno">130</tt> <tt class="py-line"> <tt class="py-name">lines</tt> <tt class="py-op">=</tt> <tt class="py-name">sc</tt><tt class="py-op">.</tt><tt id="link-19" class="py-name" targets="Method pyspark.context.SparkContext.textFile()=pyspark.context.SparkContext-class.html#textFile"><a title="pyspark.context.SparkContext.textFile" class="py-name" href="#" onclick="return doclink('link-19', 'textFile', 'link-19');">textFile</a></tt><tt class="py-op">(</tt><tt class="py-name">path</tt><tt class="py-op">,</tt> <tt class="py-name">minPartitions</tt><tt class="py-op">)</tt> </tt>
<a name="L131"></a><tt class="py-lineno">131</tt> <tt class="py-line"> <tt class="py-name">parsed</tt> <tt class="py-op">=</tt> <tt class="py-name">lines</tt><tt class="py-op">.</tt><tt id="link-20" class="py-name" targets="Method pyspark.rdd.RDD.map()=pyspark.rdd.RDD-class.html#map"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-20', 'map', 'link-20');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">l</tt><tt class="py-op">:</tt> <tt id="link-21" class="py-name" targets="Class pyspark.mllib.util.MLUtils=pyspark.mllib.util.MLUtils-class.html"><a title="pyspark.mllib.util.MLUtils" class="py-name" href="#" onclick="return doclink('link-21', 'MLUtils', 'link-21');">MLUtils</a></tt><tt class="py-op">.</tt><tt class="py-name">_parse_libsvm_line</tt><tt class="py-op">(</tt><tt class="py-name">l</tt><tt class="py-op">,</tt> <tt class="py-name">multiclass</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L132"></a><tt class="py-lineno">132</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">numFeatures</tt> <tt class="py-op">&lt;=</tt> <tt class="py-number">0</tt><tt class="py-op">:</tt> </tt>
<a name="L133"></a><tt class="py-lineno">133</tt> <tt class="py-line"> <tt class="py-name">parsed</tt><tt class="py-op">.</tt><tt id="link-22" class="py-name" targets="Method pyspark.rdd.RDD.cache()=pyspark.rdd.RDD-class.html#cache,Method pyspark.sql.SchemaRDD.cache()=pyspark.sql.SchemaRDD-class.html#cache"><a title="pyspark.rdd.RDD.cache
pyspark.sql.SchemaRDD.cache" class="py-name" href="#" onclick="return doclink('link-22', 'cache', 'link-22');">cache</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L134"></a><tt class="py-lineno">134</tt> <tt class="py-line"> <tt class="py-name">numFeatures</tt> <tt class="py-op">=</tt> <tt class="py-name">parsed</tt><tt class="py-op">.</tt><tt id="link-23" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-23', 'map', 'link-20');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-number">0</tt> <tt class="py-keyword">if</tt> <tt class="py-name">x</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">size</tt> <tt class="py-op">==</tt> <tt class="py-number">0</tt> <tt class="py-keyword">else</tt> <tt class="py-name">x</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">[</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-24" class="py-name" targets="Method pyspark.rdd.RDD.reduce()=pyspark.rdd.RDD-class.html#reduce"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-24', 'reduce', 'link-24');">reduce</a></tt><tt class="py-op">(</tt><tt id="link-25" class="py-name" targets="Method pyspark.rdd.RDD.max()=pyspark.rdd.RDD-class.html#max,Method pyspark.statcounter.StatCounter.max()=pyspark.statcounter.StatCounter-class.html#max"><a title="pyspark.rdd.RDD.max
pyspark.statcounter.StatCounter.max" class="py-name" href="#" onclick="return doclink('link-25', 'max', 'link-25');">max</a></tt><tt class="py-op">)</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt> </tt>
<a name="L135"></a><tt class="py-lineno">135</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">parsed</tt><tt class="py-op">.</tt><tt id="link-26" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-26', 'map', 'link-20');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt id="link-27" class="py-name"><a title="pyspark.mllib.regression.LabeledPoint" class="py-name" href="#" onclick="return doclink('link-27', 'LabeledPoint', 'link-8');">LabeledPoint</a></tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt id="link-28" class="py-name"><a title="pyspark.mllib.linalg.Vectors" class="py-name" href="#" onclick="return doclink('link-28', 'Vectors', 'link-3');">Vectors</a></tt><tt class="py-op">.</tt><tt id="link-29" class="py-name" targets="Static Method pyspark.mllib.linalg.Vectors.sparse()=pyspark.mllib.linalg.Vectors-class.html#sparse"><a title="pyspark.mllib.linalg.Vectors.sparse" class="py-name" href="#" onclick="return doclink('link-29', 'sparse', 'link-29');">sparse</a></tt><tt class="py-op">(</tt><tt class="py-name">numFeatures</tt><tt class="py-op">,</tt> <tt class="py-name">x</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">x</tt><tt class="py-op">[</tt><tt class="py-number">2</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div><a name="L136"></a><tt class="py-lineno">136</tt> <tt class="py-line"> </tt>
<a name="L137"></a><tt class="py-lineno">137</tt> <tt class="py-line"> <tt class="py-decorator">@</tt><tt class="py-decorator">staticmethod</tt> </tt>
<a name="MLUtils.saveAsLibSVMFile"></a><div id="MLUtils.saveAsLibSVMFile-def"><a name="L138"></a><tt class="py-lineno">138</tt> <a class="py-toggle" href="#" id="MLUtils.saveAsLibSVMFile-toggle" onclick="return toggle('MLUtils.saveAsLibSVMFile');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.mllib.util.MLUtils-class.html#saveAsLibSVMFile">saveAsLibSVMFile</a><tt class="py-op">(</tt><tt class="py-param">data</tt><tt class="py-op">,</tt> <tt class="py-param">dir</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="MLUtils.saveAsLibSVMFile-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="MLUtils.saveAsLibSVMFile-expanded"><a name="L139"></a><tt class="py-lineno">139</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
<a name="L140"></a><tt class="py-lineno">140</tt> <tt class="py-line"><tt class="py-docstring"> Save labeled data in LIBSVM format.</tt> </tt>
<a name="L141"></a><tt class="py-lineno">141</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L142"></a><tt class="py-lineno">142</tt> <tt class="py-line"><tt class="py-docstring"> @param data: an RDD of LabeledPoint to be saved</tt> </tt>
<a name="L143"></a><tt class="py-lineno">143</tt> <tt class="py-line"><tt class="py-docstring"> @param dir: directory to save the data</tt> </tt>
<a name="L144"></a><tt class="py-lineno">144</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L145"></a><tt class="py-lineno">145</tt> <tt class="py-line"><tt class="py-docstring"> >>> from tempfile import NamedTemporaryFile</tt> </tt>
<a name="L146"></a><tt class="py-lineno">146</tt> <tt class="py-line"><tt class="py-docstring"> >>> from fileinput import input</tt> </tt>
<a name="L147"></a><tt class="py-lineno">147</tt> <tt class="py-line"><tt class="py-docstring"> >>> from glob import glob</tt> </tt>
<a name="L148"></a><tt class="py-lineno">148</tt> <tt class="py-line"><tt class="py-docstring"> >>> from pyspark.mllib.util import MLUtils</tt> </tt>
<a name="L149"></a><tt class="py-lineno">149</tt> <tt class="py-line"><tt class="py-docstring"> >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \</tt> </tt>
<a name="L150"></a><tt class="py-lineno">150</tt> <tt class="py-line"><tt class="py-docstring"> LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]</tt> </tt>
<a name="L151"></a><tt class="py-lineno">151</tt> <tt class="py-line"><tt class="py-docstring"> >>> tempFile = NamedTemporaryFile(delete=True)</tt> </tt>
<a name="L152"></a><tt class="py-lineno">152</tt> <tt class="py-line"><tt class="py-docstring"> >>> tempFile.close()</tt> </tt>
<a name="L153"></a><tt class="py-lineno">153</tt> <tt class="py-line"><tt class="py-docstring"> >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)</tt> </tt>
<a name="L154"></a><tt class="py-lineno">154</tt> <tt class="py-line"><tt class="py-docstring"> >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))</tt> </tt>
<a name="L155"></a><tt class="py-lineno">155</tt> <tt class="py-line"><tt class="py-docstring"> '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'</tt> </tt>
<a name="L156"></a><tt class="py-lineno">156</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
<a name="L157"></a><tt class="py-lineno">157</tt> <tt class="py-line"> <tt class="py-name">lines</tt> <tt class="py-op">=</tt> <tt class="py-name">data</tt><tt class="py-op">.</tt><tt id="link-30" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-30', 'map', 'link-20');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">p</tt><tt class="py-op">:</tt> <tt id="link-31" class="py-name"><a title="pyspark.mllib.util.MLUtils" class="py-name" href="#" onclick="return doclink('link-31', 'MLUtils', 'link-21');">MLUtils</a></tt><tt class="py-op">.</tt><tt class="py-name">_convert_labeled_point_to_libsvm</tt><tt class="py-op">(</tt><tt class="py-name">p</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L158"></a><tt class="py-lineno">158</tt> <tt class="py-line"> <tt class="py-name">lines</tt><tt class="py-op">.</tt><tt id="link-32" class="py-name" targets="Method pyspark.rdd.RDD.saveAsTextFile()=pyspark.rdd.RDD-class.html#saveAsTextFile"><a title="pyspark.rdd.RDD.saveAsTextFile" class="py-name" href="#" onclick="return doclink('link-32', 'saveAsTextFile', 'link-32');">saveAsTextFile</a></tt><tt class="py-op">(</tt><tt class="py-name">dir</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L159"></a><tt class="py-lineno">159</tt> <tt class="py-line"> </tt>
<a name="_test"></a><div id="_test-def"><a name="L160"></a><tt class="py-lineno">160</tt> <tt class="py-line"> </tt>
<a name="L161"></a><tt class="py-lineno">161</tt> <a class="py-toggle" href="#" id="_test-toggle" onclick="return toggle('_test');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.mllib.util-module.html#_test">_test</a><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="_test-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="_test-expanded"><a name="L162"></a><tt class="py-lineno">162</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">doctest</tt> </tt>
<a name="L163"></a><tt class="py-lineno">163</tt> <tt class="py-line"> <tt class="py-keyword">from</tt> <tt id="link-33" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-33', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-34" class="py-name" targets="Module pyspark.context=pyspark.context-module.html,Method pyspark.rdd.RDD.context()=pyspark.rdd.RDD-class.html#context"><a title="pyspark.context
pyspark.rdd.RDD.context" class="py-name" href="#" onclick="return doclink('link-34', 'context', 'link-34');">context</a></tt> <tt class="py-keyword">import</tt> <tt id="link-35" class="py-name" targets="Class pyspark.context.SparkContext=pyspark.context.SparkContext-class.html"><a title="pyspark.context.SparkContext" class="py-name" href="#" onclick="return doclink('link-35', 'SparkContext', 'link-35');">SparkContext</a></tt> </tt>
<a name="L164"></a><tt class="py-lineno">164</tt> <tt class="py-line"> <tt class="py-name">globs</tt> <tt class="py-op">=</tt> <tt class="py-name">globals</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-36" class="py-name" targets="Method pyspark.statcounter.StatCounter.copy()=pyspark.statcounter.StatCounter-class.html#copy"><a title="pyspark.statcounter.StatCounter.copy" class="py-name" href="#" onclick="return doclink('link-36', 'copy', 'link-36');">copy</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L165"></a><tt class="py-lineno">165</tt> <tt class="py-line"> <tt class="py-comment"># The small batch size here ensures that we see multiple batches,</tt> </tt>
<a name="L166"></a><tt class="py-lineno">166</tt> <tt class="py-line"> <tt class="py-comment"># even in these small test examples:</tt> </tt>
<a name="L167"></a><tt class="py-lineno">167</tt> <tt class="py-line"> <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sc'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt id="link-37" class="py-name"><a title="pyspark.context.SparkContext" class="py-name" href="#" onclick="return doclink('link-37', 'SparkContext', 'link-35');">SparkContext</a></tt><tt class="py-op">(</tt><tt class="py-string">'local[2]'</tt><tt class="py-op">,</tt> <tt class="py-string">'PythonTest'</tt><tt class="py-op">,</tt> <tt class="py-name">batchSize</tt><tt class="py-op">=</tt><tt class="py-number">2</tt><tt class="py-op">)</tt> </tt>
<a name="L168"></a><tt class="py-lineno">168</tt> <tt class="py-line"> <tt class="py-op">(</tt><tt class="py-name">failure_count</tt><tt class="py-op">,</tt> <tt class="py-name">test_count</tt><tt class="py-op">)</tt> <tt class="py-op">=</tt> <tt class="py-name">doctest</tt><tt class="py-op">.</tt><tt class="py-name">testmod</tt><tt class="py-op">(</tt><tt class="py-name">globs</tt><tt class="py-op">=</tt><tt class="py-name">globs</tt><tt class="py-op">,</tt> <tt class="py-name">optionflags</tt><tt class="py-op">=</tt><tt class="py-name">doctest</tt><tt class="py-op">.</tt><tt class="py-name">ELLIPSIS</tt><tt class="py-op">)</tt> </tt>
<a name="L169"></a><tt class="py-lineno">169</tt> <tt class="py-line"> <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sc'</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt id="link-38" class="py-name" targets="Method pyspark.context.SparkContext.stop()=pyspark.context.SparkContext-class.html#stop"><a title="pyspark.context.SparkContext.stop" class="py-name" href="#" onclick="return doclink('link-38', 'stop', 'link-38');">stop</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L170"></a><tt class="py-lineno">170</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">failure_count</tt><tt class="py-op">:</tt> </tt>
<a name="L171"></a><tt class="py-lineno">171</tt> <tt class="py-line"> <tt class="py-name">exit</tt><tt class="py-op">(</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">)</tt> </tt>
</div><a name="L172"></a><tt class="py-lineno">172</tt> <tt class="py-line"> </tt>
<a name="L173"></a><tt class="py-lineno">173</tt> <tt class="py-line"> </tt>
<a name="L174"></a><tt class="py-lineno">174</tt> <tt class="py-line"><tt class="py-keyword">if</tt> <tt class="py-name">__name__</tt> <tt class="py-op">==</tt> <tt class="py-string">"__main__"</tt><tt class="py-op">:</tt> </tt>
<a name="L175"></a><tt class="py-lineno">175</tt> <tt class="py-line"> <tt class="py-name">_test</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L176"></a><tt class="py-lineno">176</tt> <tt class="py-line"> </tt><script type="text/javascript">
<!--
expandto(location.href);
// -->
</script>
</pre>
<br />
<!-- Bottom navigation bar: Home, Trees, Indices and Help links, followed by the project homepage link -->
<table class="navbar" width="100%" bgcolor="#a0c0ff"
       border="0" cellpadding="0" cellspacing="0">
  <tr valign="middle">
    <!-- Link to pyspark-module.html -->
    <th>   <a href="pyspark-module.html">Home</a>   </th>
    <!-- Link to module-tree.html -->
    <th>   <a href="module-tree.html">Trees</a>   </th>
    <!-- Link to identifier-index.html -->
    <th>   <a href="identifier-index.html">Indices</a>   </th>
    <!-- Link to help.html -->
    <th>   <a href="help.html">Help</a>   </th>
    <!-- Project homepage: right-aligned cell holding a one-cell nested table -->
    <th class="navbar" align="right" width="100%">
      <table border="0" cellpadding="0" cellspacing="0">
        <tr>
          <th class="navbar" align="center"><a class="navbar" href="http://spark.apache.org" target="_top">Spark 1.0.1 Python API Docs</a></th>
        </tr>
      </table>
    </th>
  </tr>
</table>
<!-- Page footer: generation stamp on the left, link to the Epydoc site on the right.
     The table width is a single percentage ("100%"); a doubled percent sign is not a valid length. -->
<table border="0" cellpadding="0" cellspacing="0" width="100%">
  <tr>
    <td align="left" class="footer">
    Generated by Epydoc 3.0.1 on Fri Jul 4 18:52:27 2014
    </td>
    <td align="right" class="footer">
      <a target="mainFrame" href="http://epydoc.sourceforge.net"
        >http://epydoc.sourceforge.net</a>
    </td>
  </tr>
</table>
<!-- Inline start-up script. It calls checkCookie(), which is not defined in
     this page; presumably it is provided by epydoc.js, which the document
     head loads (to be confirmed). The generator's own note inside the script
     explains the intended effect on private objects. -->
<script type="text/javascript">
  <!--
  // Private objects are initially displayed (because if
  // javascript is turned off then we want them to be
  // visible); but by default, we want to hide them.  So hide
  // them unless we have a cookie that says to show them.
  checkCookie();
  // -->
</script>
</body>
</html>
|