summaryrefslogtreecommitdiff
path: root/site/docs/0.7.0/api/pyspark/pyspark.rdd-pysrc.html
diff options
context:
space:
mode:
Diffstat (limited to 'site/docs/0.7.0/api/pyspark/pyspark.rdd-pysrc.html')
-rw-r--r--site/docs/0.7.0/api/pyspark/pyspark.rdd-pysrc.html884
1 files changed, 884 insertions, 0 deletions
diff --git a/site/docs/0.7.0/api/pyspark/pyspark.rdd-pysrc.html b/site/docs/0.7.0/api/pyspark/pyspark.rdd-pysrc.html
new file mode 100644
index 000000000..398b52fe3
--- /dev/null
+++ b/site/docs/0.7.0/api/pyspark/pyspark.rdd-pysrc.html
@@ -0,0 +1,884 @@
+<?xml version="1.0" encoding="ascii"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>pyspark.rdd</title>
+ <link rel="stylesheet" href="epydoc.css" type="text/css" />
+ <script type="text/javascript" src="epydoc.js"></script>
+</head>
+
+<body bgcolor="white" text="black" link="blue" vlink="#204080"
+ alink="#204080">
+<!-- ==================== NAVIGATION BAR ==================== -->
+<table class="navbar" border="0" width="100%" cellpadding="0"
+ bgcolor="#a0c0ff" cellspacing="0">
+ <tr valign="middle">
+ <!-- Home link -->
+ <th>&nbsp;&nbsp;&nbsp;<a
+ href="pyspark-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>
+
+ <!-- Tree link -->
+ <th>&nbsp;&nbsp;&nbsp;<a
+ href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
+
+ <!-- Index link -->
+ <th>&nbsp;&nbsp;&nbsp;<a
+ href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
+
+ <!-- Help link -->
+ <th>&nbsp;&nbsp;&nbsp;<a
+ href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
+
+ <!-- Project homepage -->
+ <th class="navbar" align="right" width="100%">
+ <table border="0" cellpadding="0" cellspacing="0">
+ <tr><th class="navbar" align="center"
+ ><a class="navbar" target="_top" href="http://spark-project.org">PySpark</a></th>
+ </tr></table></th>
+ </tr>
+</table>
+<table width="100%" cellpadding="0" cellspacing="0">
+ <tr valign="top">
+ <td width="100%">
+ <span class="breadcrumbs">
+ <a href="pyspark-module.html">Package&nbsp;pyspark</a> ::
+ Module&nbsp;rdd
+ </span>
+ </td>
+ <td>
+ <table cellpadding="0" cellspacing="0">
+ <!-- hide/show private -->
+ <tr><td align="right"><span class="options"
+ >[<a href="frames.html" target="_top">frames</a
+ >]&nbsp;|&nbsp;[<a href="pyspark.rdd-pysrc.html"
+ target="_top">no&nbsp;frames</a>]</span></td></tr>
+ </table>
+ </td>
+ </tr>
+</table>
+<h1 class="epydoc">Source Code for <a href="pyspark.rdd-module.html">Module pyspark.rdd</a></h1>
+<pre class="py-src">
+<a name="L1"></a><tt class="py-lineno"> 1</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">base64</tt> <tt class="py-keyword">import</tt> <tt class="py-name">standard_b64encode</tt> <tt class="py-keyword">as</tt> <tt class="py-name">b64enc</tt> </tt>
+<a name="L2"></a><tt class="py-lineno"> 2</tt> <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">copy</tt> </tt>
+<a name="L3"></a><tt class="py-lineno"> 3</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">collections</tt> <tt class="py-keyword">import</tt> <tt class="py-name">defaultdict</tt> </tt>
+<a name="L4"></a><tt class="py-lineno"> 4</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">itertools</tt> <tt class="py-keyword">import</tt> <tt class="py-name">chain</tt><tt class="py-op">,</tt> <tt class="py-name">ifilter</tt><tt class="py-op">,</tt> <tt class="py-name">imap</tt><tt class="py-op">,</tt> <tt class="py-name">product</tt> </tt>
+<a name="L5"></a><tt class="py-lineno"> 5</tt> <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">operator</tt> </tt>
+<a name="L6"></a><tt class="py-lineno"> 6</tt> <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">os</tt> </tt>
+<a name="L7"></a><tt class="py-lineno"> 7</tt> <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">shlex</tt> </tt>
+<a name="L8"></a><tt class="py-lineno"> 8</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">subprocess</tt> <tt class="py-keyword">import</tt> <tt class="py-name">Popen</tt><tt class="py-op">,</tt> <tt class="py-name">PIPE</tt> </tt>
+<a name="L9"></a><tt class="py-lineno"> 9</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">tempfile</tt> <tt class="py-keyword">import</tt> <tt class="py-name">NamedTemporaryFile</tt> </tt>
+<a name="L10"></a><tt class="py-lineno"> 10</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">threading</tt> <tt class="py-keyword">import</tt> <tt class="py-name">Thread</tt> </tt>
+<a name="L11"></a><tt class="py-lineno"> 11</tt> <tt class="py-line"> </tt>
+<a name="L12"></a><tt class="py-lineno"> 12</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-0" class="py-name" targets="Package pyspark=pyspark-module.html"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-0', 'pyspark', 'link-0');">pyspark</a></tt> <tt class="py-keyword">import</tt> <tt class="py-name">cloudpickle</tt> </tt>
+<a name="L13"></a><tt class="py-lineno"> 13</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-1" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-1', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt class="py-name">serializers</tt> <tt class="py-keyword">import</tt> <tt class="py-name">batched</tt><tt class="py-op">,</tt> <tt class="py-name">Batch</tt><tt class="py-op">,</tt> <tt class="py-name">dump_pickle</tt><tt class="py-op">,</tt> <tt class="py-name">load_pickle</tt><tt class="py-op">,</tt> \ </tt>
+<a name="L14"></a><tt class="py-lineno"> 14</tt> <tt class="py-line"> <tt class="py-name">read_from_pickle_file</tt> </tt>
+<a name="L15"></a><tt class="py-lineno"> 15</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-2" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-2', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-3" class="py-name" targets="Method pyspark.rdd.RDD.join()=pyspark.rdd.RDD-class.html#join"><a title="pyspark.rdd.RDD.join" class="py-name" href="#" onclick="return doclink('link-3', 'join', 'link-3');">join</a></tt> <tt class="py-keyword">import</tt> <tt class="py-name">python_join</tt><tt class="py-op">,</tt> <tt class="py-name">python_left_outer_join</tt><tt class="py-op">,</tt> \ </tt>
+<a name="L16"></a><tt class="py-lineno"> 16</tt> <tt class="py-line"> <tt class="py-name">python_right_outer_join</tt><tt class="py-op">,</tt> <tt class="py-name">python_cogroup</tt> </tt>
+<a name="L17"></a><tt class="py-lineno"> 17</tt> <tt class="py-line"> </tt>
+<a name="L18"></a><tt class="py-lineno"> 18</tt> <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">py4j</tt><tt class="py-op">.</tt><tt class="py-name">java_collections</tt> <tt class="py-keyword">import</tt> <tt class="py-name">ListConverter</tt><tt class="py-op">,</tt> <tt class="py-name">MapConverter</tt> </tt>
+<a name="L19"></a><tt class="py-lineno"> 19</tt> <tt class="py-line"> </tt>
+<a name="L20"></a><tt class="py-lineno"> 20</tt> <tt class="py-line"> </tt>
+<a name="L21"></a><tt class="py-lineno"> 21</tt> <tt class="py-line"><tt class="py-name">__all__</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-string">"RDD"</tt><tt class="py-op">]</tt> </tt>
+<a name="RDD"></a><div id="RDD-def"><a name="L22"></a><tt class="py-lineno"> 22</tt> <tt class="py-line"> </tt>
+<a name="L23"></a><tt class="py-lineno"> 23</tt> <tt class="py-line"> </tt>
+<a name="L24"></a><tt class="py-lineno"> 24</tt> <a class="py-toggle" href="#" id="RDD-toggle" onclick="return toggle('RDD');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html">RDD</a><tt class="py-op">(</tt><tt class="py-base-class">object</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="RDD-expanded"><a name="L25"></a><tt class="py-lineno"> 25</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L26"></a><tt class="py-lineno"> 26</tt> <tt class="py-line"><tt class="py-docstring"> A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.</tt> </tt>
+<a name="L27"></a><tt class="py-lineno"> 27</tt> <tt class="py-line"><tt class="py-docstring"> Represents an immutable, partitioned collection of elements that can be</tt> </tt>
+<a name="L28"></a><tt class="py-lineno"> 28</tt> <tt class="py-line"><tt class="py-docstring"> operated on in parallel.</tt> </tt>
+<a name="L29"></a><tt class="py-lineno"> 29</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L30"></a><tt class="py-lineno"> 30</tt> <tt class="py-line"> </tt>
+<a name="RDD.__init__"></a><div id="RDD.__init__-def"><a name="L31"></a><tt class="py-lineno"> 31</tt> <a class="py-toggle" href="#" id="RDD.__init__-toggle" onclick="return toggle('RDD.__init__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">jrdd</tt><tt class="py-op">,</tt> <tt class="py-param">ctx</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.__init__-expanded"><a name="L32"></a><tt class="py-lineno"> 32</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">jrdd</tt> </tt>
+<a name="L33"></a><tt class="py-lineno"> 33</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
+<a name="L34"></a><tt class="py-lineno"> 34</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
+<a name="L35"></a><tt class="py-lineno"> 35</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> <tt class="py-op">=</tt> <tt class="py-name">ctx</tt> </tt>
+<a name="L36"></a><tt class="py-lineno"> 36</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_partitionFunc</tt> <tt class="py-op">=</tt> <tt class="py-name">None</tt> </tt>
+</div><a name="L37"></a><tt class="py-lineno"> 37</tt> <tt class="py-line"> </tt>
+<a name="L38"></a><tt class="py-lineno"> 38</tt> <tt class="py-line"> <tt class="py-decorator">@</tt><tt class="py-decorator">property</tt> </tt>
+<a name="RDD.context"></a><div id="RDD.context-def"><a name="L39"></a><tt class="py-lineno"> 39</tt> <a class="py-toggle" href="#" id="RDD.context-toggle" onclick="return toggle('RDD.context');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#context">context</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.context-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.context-expanded"><a name="L40"></a><tt class="py-lineno"> 40</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L41"></a><tt class="py-lineno"> 41</tt> <tt class="py-line"><tt class="py-docstring"> The L{SparkContext} that this RDD was created on.</tt> </tt>
+<a name="L42"></a><tt class="py-lineno"> 42</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L43"></a><tt class="py-lineno"> 43</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> </tt>
+</div><a name="L44"></a><tt class="py-lineno"> 44</tt> <tt class="py-line"> </tt>
+<a name="RDD.cache"></a><div id="RDD.cache-def"><a name="L45"></a><tt class="py-lineno"> 45</tt> <a class="py-toggle" href="#" id="RDD.cache-toggle" onclick="return toggle('RDD.cache');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#cache">cache</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.cache-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.cache-expanded"><a name="L46"></a><tt class="py-lineno"> 46</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L47"></a><tt class="py-lineno"> 47</tt> <tt class="py-line"><tt class="py-docstring"> Persist this RDD with the default storage level (C{MEMORY_ONLY}).</tt> </tt>
+<a name="L48"></a><tt class="py-lineno"> 48</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L49"></a><tt class="py-lineno"> 49</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
+<a name="L50"></a><tt class="py-lineno"> 50</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-4" class="py-name" targets="Method pyspark.rdd.RDD.cache()=pyspark.rdd.RDD-class.html#cache"><a title="pyspark.rdd.RDD.cache" class="py-name" href="#" onclick="return doclink('link-4', 'cache', 'link-4');">cache</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L51"></a><tt class="py-lineno"> 51</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt> </tt>
+</div><a name="L52"></a><tt class="py-lineno"> 52</tt> <tt class="py-line"> </tt>
+<a name="RDD.checkpoint"></a><div id="RDD.checkpoint-def"><a name="L53"></a><tt class="py-lineno"> 53</tt> <a class="py-toggle" href="#" id="RDD.checkpoint-toggle" onclick="return toggle('RDD.checkpoint');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#checkpoint">checkpoint</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.checkpoint-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.checkpoint-expanded"><a name="L54"></a><tt class="py-lineno"> 54</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L55"></a><tt class="py-lineno"> 55</tt> <tt class="py-line"><tt class="py-docstring"> Mark this RDD for checkpointing. It will be saved to a file inside the</tt> </tt>
+<a name="L56"></a><tt class="py-lineno"> 56</tt> <tt class="py-line"><tt class="py-docstring"> checkpoint directory set with L{SparkContext.setCheckpointDir()} and</tt> </tt>
+<a name="L57"></a><tt class="py-lineno"> 57</tt> <tt class="py-line"><tt class="py-docstring"> all references to its parent RDDs will be removed. This function must</tt> </tt>
+<a name="L58"></a><tt class="py-lineno"> 58</tt> <tt class="py-line"><tt class="py-docstring"> be called before any job has been executed on this RDD. It is strongly</tt> </tt>
+<a name="L59"></a><tt class="py-lineno"> 59</tt> <tt class="py-line"><tt class="py-docstring"> recommended that this RDD is persisted in memory, otherwise saving it</tt> </tt>
+<a name="L60"></a><tt class="py-lineno"> 60</tt> <tt class="py-line"><tt class="py-docstring"> on a file will require recomputation.</tt> </tt>
+<a name="L61"></a><tt class="py-lineno"> 61</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L62"></a><tt class="py-lineno"> 62</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
+<a name="L63"></a><tt class="py-lineno"> 63</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-5" class="py-name" targets="Module pyspark.rdd=pyspark.rdd-module.html"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-5', 'rdd', 'link-5');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-6" class="py-name" targets="Method pyspark.rdd.RDD.checkpoint()=pyspark.rdd.RDD-class.html#checkpoint"><a title="pyspark.rdd.RDD.checkpoint" class="py-name" href="#" onclick="return doclink('link-6', 'checkpoint', 'link-6');">checkpoint</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L64"></a><tt class="py-lineno"> 64</tt> <tt class="py-line"> </tt>
+<a name="RDD.isCheckpointed"></a><div id="RDD.isCheckpointed-def"><a name="L65"></a><tt class="py-lineno"> 65</tt> <a class="py-toggle" href="#" id="RDD.isCheckpointed-toggle" onclick="return toggle('RDD.isCheckpointed');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#isCheckpointed">isCheckpointed</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.isCheckpointed-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.isCheckpointed-expanded"><a name="L66"></a><tt class="py-lineno"> 66</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L67"></a><tt class="py-lineno"> 67</tt> <tt class="py-line"><tt class="py-docstring"> Return whether this RDD has been checkpointed or not</tt> </tt>
+<a name="L68"></a><tt class="py-lineno"> 68</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L69"></a><tt class="py-lineno"> 69</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-7" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-7', 'rdd', 'link-5');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-8" class="py-name" targets="Method pyspark.rdd.RDD.isCheckpointed()=pyspark.rdd.RDD-class.html#isCheckpointed"><a title="pyspark.rdd.RDD.isCheckpointed" class="py-name" href="#" onclick="return doclink('link-8', 'isCheckpointed', 'link-8');">isCheckpointed</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L70"></a><tt class="py-lineno"> 70</tt> <tt class="py-line"> </tt>
+<a name="RDD.getCheckpointFile"></a><div id="RDD.getCheckpointFile-def"><a name="L71"></a><tt class="py-lineno"> 71</tt> <a class="py-toggle" href="#" id="RDD.getCheckpointFile-toggle" onclick="return toggle('RDD.getCheckpointFile');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#getCheckpointFile">getCheckpointFile</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.getCheckpointFile-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.getCheckpointFile-expanded"><a name="L72"></a><tt class="py-lineno"> 72</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L73"></a><tt class="py-lineno"> 73</tt> <tt class="py-line"><tt class="py-docstring"> Gets the name of the file to which this RDD was checkpointed</tt> </tt>
+<a name="L74"></a><tt class="py-lineno"> 74</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L75"></a><tt class="py-lineno"> 75</tt> <tt class="py-line"> <tt class="py-name">checkpointFile</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-9" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-9', 'rdd', 'link-5');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-10" class="py-name" targets="Method pyspark.rdd.RDD.getCheckpointFile()=pyspark.rdd.RDD-class.html#getCheckpointFile"><a title="pyspark.rdd.RDD.getCheckpointFile" class="py-name" href="#" onclick="return doclink('link-10', 'getCheckpointFile', 'link-10');">getCheckpointFile</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L76"></a><tt class="py-lineno"> 76</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">checkpointFile</tt><tt class="py-op">.</tt><tt class="py-name">isDefined</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L77"></a><tt class="py-lineno"> 77</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">checkpointFile</tt><tt class="py-op">.</tt><tt id="link-11" class="py-name" targets="Class Method pyspark.files.SparkFiles.get()=pyspark.files.SparkFiles-class.html#get"><a title="pyspark.files.SparkFiles.get" class="py-name" href="#" onclick="return doclink('link-11', 'get', 'link-11');">get</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L78"></a><tt class="py-lineno"> 78</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
+<a name="L79"></a><tt class="py-lineno"> 79</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">None</tt> </tt>
+</div><a name="L80"></a><tt class="py-lineno"> 80</tt> <tt class="py-line"> </tt>
+<a name="L81"></a><tt class="py-lineno"> 81</tt> <tt class="py-line"> <tt class="py-comment"># TODO persist(self, storageLevel)</tt> </tt>
+<a name="L82"></a><tt class="py-lineno"> 82</tt> <tt class="py-line"> </tt>
+<a name="RDD.map"></a><div id="RDD.map-def"><a name="L83"></a><tt class="py-lineno"> 83</tt> <a class="py-toggle" href="#" id="RDD.map-toggle" onclick="return toggle('RDD.map');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#map">map</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.map-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.map-expanded"><a name="L84"></a><tt class="py-lineno"> 84</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L85"></a><tt class="py-lineno"> 85</tt> <tt class="py-line"><tt class="py-docstring"> Return a new RDD by applying a function to each element of this RDD.</tt> </tt>
+<a name="L86"></a><tt class="py-lineno"> 86</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L87"></a><tt class="py-lineno"> 87</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">return</tt> <tt class="py-name">imap</tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
+<a name="L88"></a><tt class="py-lineno"> 88</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L89"></a><tt class="py-lineno"> 89</tt> <tt class="py-line"> </tt>
+<a name="RDD.flatMap"></a><div id="RDD.flatMap-def"><a name="L90"></a><tt class="py-lineno"> 90</tt> <a class="py-toggle" href="#" id="RDD.flatMap-toggle" onclick="return toggle('RDD.flatMap');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#flatMap">flatMap</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.flatMap-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.flatMap-expanded"><a name="L91"></a><tt class="py-lineno"> 91</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L92"></a><tt class="py-lineno"> 92</tt> <tt class="py-line"><tt class="py-docstring"> Return a new RDD by first applying a function to all elements of this</tt> </tt>
+<a name="L93"></a><tt class="py-lineno"> 93</tt> <tt class="py-line"><tt class="py-docstring"> RDD, and then flattening the results.</tt> </tt>
+<a name="L94"></a><tt class="py-lineno"> 94</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L95"></a><tt class="py-lineno"> 95</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([2, 3, 4])</tt> </tt>
+<a name="L96"></a><tt class="py-lineno"> 96</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(rdd.flatMap(lambda x: range(1, x)).collect())</tt> </tt>
+<a name="L97"></a><tt class="py-lineno"> 97</tt> <tt class="py-line"><tt class="py-docstring"> [1, 1, 1, 2, 2, 3]</tt> </tt>
+<a name="L98"></a><tt class="py-lineno"> 98</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect())</tt> </tt>
+<a name="L99"></a><tt class="py-lineno"> 99</tt> <tt class="py-line"><tt class="py-docstring"> [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]</tt> </tt>
+<a name="L100"></a><tt class="py-lineno">100</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L101"></a><tt class="py-lineno">101</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">s</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">return</tt> <tt class="py-name">chain</tt><tt class="py-op">.</tt><tt class="py-name">from_iterable</tt><tt class="py-op">(</tt><tt class="py-name">imap</tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+<a name="L102"></a><tt class="py-lineno">102</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-12" class="py-name" targets="Method pyspark.rdd.RDD.mapPartitionsWithSplit()=pyspark.rdd.RDD-class.html#mapPartitionsWithSplit"><a title="pyspark.rdd.RDD.mapPartitionsWithSplit" class="py-name" href="#" onclick="return doclink('link-12', 'mapPartitionsWithSplit', 'link-12');">mapPartitionsWithSplit</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L103"></a><tt class="py-lineno">103</tt> <tt class="py-line"> </tt>
+<a name="RDD.mapPartitions"></a><div id="RDD.mapPartitions-def"><a name="L104"></a><tt class="py-lineno">104</tt> <a class="py-toggle" href="#" id="RDD.mapPartitions-toggle" onclick="return toggle('RDD.mapPartitions');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#mapPartitions">mapPartitions</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.mapPartitions-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.mapPartitions-expanded"><a name="L105"></a><tt class="py-lineno">105</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L106"></a><tt class="py-lineno">106</tt> <tt class="py-line"><tt class="py-docstring"> Return a new RDD by applying a function to each partition of this RDD.</tt> </tt>
+<a name="L107"></a><tt class="py-lineno">107</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L108"></a><tt class="py-lineno">108</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4], 2)</tt> </tt>
+<a name="L109"></a><tt class="py-lineno">109</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; def f(iterator): yield sum(iterator)</tt> </tt>
+<a name="L110"></a><tt class="py-lineno">110</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd.mapPartitions(f).collect()</tt> </tt>
+<a name="L111"></a><tt class="py-lineno">111</tt> <tt class="py-line"><tt class="py-docstring"> [3, 7]</tt> </tt>
+<a name="L112"></a><tt class="py-lineno">112</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L113"></a><tt class="py-lineno">113</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">s</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">return</tt> <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
+<a name="L114"></a><tt class="py-lineno">114</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-13" class="py-name"><a title="pyspark.rdd.RDD.mapPartitionsWithSplit" class="py-name" href="#" onclick="return doclink('link-13', 'mapPartitionsWithSplit', 'link-12');">mapPartitionsWithSplit</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L115"></a><tt class="py-lineno">115</tt> <tt class="py-line"> </tt>
+<a name="RDD.mapPartitionsWithSplit"></a><div id="RDD.mapPartitionsWithSplit-def"><a name="L116"></a><tt class="py-lineno">116</tt> <a class="py-toggle" href="#" id="RDD.mapPartitionsWithSplit-toggle" onclick="return toggle('RDD.mapPartitionsWithSplit');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#mapPartitionsWithSplit">mapPartitionsWithSplit</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.mapPartitionsWithSplit-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.mapPartitionsWithSplit-expanded"><a name="L117"></a><tt class="py-lineno">117</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L118"></a><tt class="py-lineno">118</tt> <tt class="py-line"><tt class="py-docstring"> Return a new RDD by applying a function to each partition of this RDD,</tt> </tt>
+<a name="L119"></a><tt class="py-lineno">119</tt> <tt class="py-line"><tt class="py-docstring"> while tracking the index of the original partition.</tt> </tt>
+<a name="L120"></a><tt class="py-lineno">120</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L121"></a><tt class="py-lineno">121</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4], 4)</tt> </tt>
+<a name="L122"></a><tt class="py-lineno">122</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; def f(splitIndex, iterator): yield splitIndex</tt> </tt>
+<a name="L123"></a><tt class="py-lineno">123</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd.mapPartitionsWithSplit(f).sum()</tt> </tt>
+<a name="L124"></a><tt class="py-lineno">124</tt> <tt class="py-line"><tt class="py-docstring"> 6</tt> </tt>
+<a name="L125"></a><tt class="py-lineno">125</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L126"></a><tt class="py-lineno">126</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L127"></a><tt class="py-lineno">127</tt> <tt class="py-line"> </tt>
+<a name="RDD.filter"></a><div id="RDD.filter-def"><a name="L128"></a><tt class="py-lineno">128</tt> <a class="py-toggle" href="#" id="RDD.filter-toggle" onclick="return toggle('RDD.filter');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#filter">filter</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.filter-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.filter-expanded"><a name="L129"></a><tt class="py-lineno">129</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L130"></a><tt class="py-lineno">130</tt> <tt class="py-line"><tt class="py-docstring"> Return a new RDD containing only the elements that satisfy a predicate.</tt> </tt>
+<a name="L131"></a><tt class="py-lineno">131</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L132"></a><tt class="py-lineno">132</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4, 5])</tt> </tt>
+<a name="L133"></a><tt class="py-lineno">133</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd.filter(lambda x: x % 2 == 0).collect()</tt> </tt>
+<a name="L134"></a><tt class="py-lineno">134</tt> <tt class="py-line"><tt class="py-docstring"> [2, 4]</tt> </tt>
+<a name="L135"></a><tt class="py-lineno">135</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L136"></a><tt class="py-lineno">136</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">return</tt> <tt class="py-name">ifilter</tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
+<a name="L137"></a><tt class="py-lineno">137</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-14" class="py-name" targets="Method pyspark.rdd.RDD.mapPartitions()=pyspark.rdd.RDD-class.html#mapPartitions"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-14', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L138"></a><tt class="py-lineno">138</tt> <tt class="py-line"> </tt>
+<a name="RDD.distinct"></a><div id="RDD.distinct-def"><a name="L139"></a><tt class="py-lineno">139</tt> <a class="py-toggle" href="#" id="RDD.distinct-toggle" onclick="return toggle('RDD.distinct');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#distinct">distinct</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.distinct-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.distinct-expanded"><a name="L140"></a><tt class="py-lineno">140</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L141"></a><tt class="py-lineno">141</tt> <tt class="py-line"><tt class="py-docstring"> Return a new RDD containing the distinct elements in this RDD.</tt> </tt>
+<a name="L142"></a><tt class="py-lineno">142</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L143"></a><tt class="py-lineno">143</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())</tt> </tt>
+<a name="L144"></a><tt class="py-lineno">144</tt> <tt class="py-line"><tt class="py-docstring"> [1, 2, 3]</tt> </tt>
+<a name="L145"></a><tt class="py-lineno">145</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L146"></a><tt class="py-lineno">146</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-15" class="py-name" targets="Method pyspark.rdd.RDD.map()=pyspark.rdd.RDD-class.html#map"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-15', 'map', 'link-15');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-string">""</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> \ </tt>
+<a name="L147"></a><tt class="py-lineno">147</tt> <tt class="py-line"> <tt class="py-op">.</tt><tt id="link-16" class="py-name" targets="Method pyspark.rdd.RDD.reduceByKey()=pyspark.rdd.RDD-class.html#reduceByKey"><a title="pyspark.rdd.RDD.reduceByKey" class="py-name" href="#" onclick="return doclink('link-16', 'reduceByKey', 'link-16');">reduceByKey</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">_</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt> \ </tt>
+<a name="L148"></a><tt class="py-lineno">148</tt> <tt class="py-line"> <tt class="py-op">.</tt><tt id="link-17" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-17', 'map', 'link-15');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">_</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L149"></a><tt class="py-lineno">149</tt> <tt class="py-line"> </tt>
+<a name="L150"></a><tt class="py-lineno">150</tt> <tt class="py-line"> <tt class="py-comment"># TODO: sampling needs to be re-implemented due to Batch</tt> </tt>
+<a name="L151"></a><tt class="py-lineno">151</tt> <tt class="py-line"> <tt class="py-comment">#def sample(self, withReplacement, fraction, seed):</tt> </tt>
+<a name="L152"></a><tt class="py-lineno">152</tt> <tt class="py-line"> <tt class="py-comment"># jrdd = self._jrdd.sample(withReplacement, fraction, seed)</tt> </tt>
+<a name="L153"></a><tt class="py-lineno">153</tt> <tt class="py-line"> <tt class="py-comment"># return RDD(jrdd, self.ctx)</tt> </tt>
+<a name="L154"></a><tt class="py-lineno">154</tt> <tt class="py-line"> </tt>
+<a name="L155"></a><tt class="py-lineno">155</tt> <tt class="py-line"> <tt class="py-comment">#def takeSample(self, withReplacement, num, seed):</tt> </tt>
+<a name="L156"></a><tt class="py-lineno">156</tt> <tt class="py-line"> <tt class="py-comment"># vals = self._jrdd.takeSample(withReplacement, num, seed)</tt> </tt>
+<a name="L157"></a><tt class="py-lineno">157</tt> <tt class="py-line"> <tt class="py-comment"># return [load_pickle(bytes(x)) for x in vals]</tt> </tt>
+<a name="L158"></a><tt class="py-lineno">158</tt> <tt class="py-line"> </tt>
+<a name="RDD.union"></a><div id="RDD.union-def"><a name="L159"></a><tt class="py-lineno">159</tt> <a class="py-toggle" href="#" id="RDD.union-toggle" onclick="return toggle('RDD.union');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#union">union</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.union-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.union-expanded"><a name="L160"></a><tt class="py-lineno">160</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L161"></a><tt class="py-lineno">161</tt> <tt class="py-line"><tt class="py-docstring"> Return the union of this RDD and another one.</tt> </tt>
+<a name="L162"></a><tt class="py-lineno">162</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L163"></a><tt class="py-lineno">163</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 1, 2, 3])</tt> </tt>
+<a name="L164"></a><tt class="py-lineno">164</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd.union(rdd).collect()</tt> </tt>
+<a name="L165"></a><tt class="py-lineno">165</tt> <tt class="py-line"><tt class="py-docstring"> [1, 1, 2, 3, 1, 1, 2, 3]</tt> </tt>
+<a name="L166"></a><tt class="py-lineno">166</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L167"></a><tt class="py-lineno">167</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt id="link-18" class="py-name" targets="Class pyspark.rdd.RDD=pyspark.rdd.RDD-class.html"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-18', 'RDD', 'link-18');">RDD</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-19" class="py-name" targets="Method pyspark.context.SparkContext.union()=pyspark.context.SparkContext-class.html#union,Method pyspark.rdd.RDD.union()=pyspark.rdd.RDD-class.html#union"><a title="pyspark.context.SparkContext.union
+pyspark.rdd.RDD.union" class="py-name" href="#" onclick="return doclink('link-19', 'union', 'link-19');">union</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L168"></a><tt class="py-lineno">168</tt> <tt class="py-line"> </tt>
+<a name="RDD.__add__"></a><div id="RDD.__add__-def"><a name="L169"></a><tt class="py-lineno">169</tt> <a class="py-toggle" href="#" id="RDD.__add__-toggle" onclick="return toggle('RDD.__add__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#__add__">__add__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.__add__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.__add__-expanded"><a name="L170"></a><tt class="py-lineno">170</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L171"></a><tt class="py-lineno">171</tt> <tt class="py-line"><tt class="py-docstring"> Return the union of this RDD and another one.</tt> </tt>
+<a name="L172"></a><tt class="py-lineno">172</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L173"></a><tt class="py-lineno">173</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 1, 2, 3])</tt> </tt>
+<a name="L174"></a><tt class="py-lineno">174</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; (rdd + rdd).collect()</tt> </tt>
+<a name="L175"></a><tt class="py-lineno">175</tt> <tt class="py-line"><tt class="py-docstring"> [1, 1, 2, 3, 1, 1, 2, 3]</tt> </tt>
+<a name="L176"></a><tt class="py-lineno">176</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L177"></a><tt class="py-lineno">177</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">isinstance</tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">,</tt> <tt id="link-20" class="py-name"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-20', 'RDD', 'link-18');">RDD</a></tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L178"></a><tt class="py-lineno">178</tt> <tt class="py-line"> <tt class="py-keyword">raise</tt> <tt class="py-name">TypeError</tt> </tt>
+<a name="L179"></a><tt class="py-lineno">179</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-21" class="py-name"><a title="pyspark.context.SparkContext.union
+pyspark.rdd.RDD.union" class="py-name" href="#" onclick="return doclink('link-21', 'union', 'link-19');">union</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L180"></a><tt class="py-lineno">180</tt> <tt class="py-line"> </tt>
+<a name="L181"></a><tt class="py-lineno">181</tt> <tt class="py-line"> <tt class="py-comment"># TODO: sort</tt> </tt>
+<a name="L182"></a><tt class="py-lineno">182</tt> <tt class="py-line"> </tt>
+<a name="RDD.glom"></a><div id="RDD.glom-def"><a name="L183"></a><tt class="py-lineno">183</tt> <a class="py-toggle" href="#" id="RDD.glom-toggle" onclick="return toggle('RDD.glom');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#glom">glom</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.glom-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.glom-expanded"><a name="L184"></a><tt class="py-lineno">184</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L185"></a><tt class="py-lineno">185</tt> <tt class="py-line"><tt class="py-docstring"> Return an RDD created by coalescing all elements within each partition</tt> </tt>
+<a name="L186"></a><tt class="py-lineno">186</tt> <tt class="py-line"><tt class="py-docstring"> into a list.</tt> </tt>
+<a name="L187"></a><tt class="py-lineno">187</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L188"></a><tt class="py-lineno">188</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4], 2)</tt> </tt>
+<a name="L189"></a><tt class="py-lineno">189</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(rdd.glom().collect())</tt> </tt>
+<a name="L190"></a><tt class="py-lineno">190</tt> <tt class="py-line"><tt class="py-docstring"> [[1, 2], [3, 4]]</tt> </tt>
+<a name="L191"></a><tt class="py-lineno">191</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L192"></a><tt class="py-lineno">192</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">yield</tt> <tt class="py-name">list</tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
+<a name="L193"></a><tt class="py-lineno">193</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-22" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-22', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L194"></a><tt class="py-lineno">194</tt> <tt class="py-line"> </tt>
+<a name="RDD.cartesian"></a><div id="RDD.cartesian-def"><a name="L195"></a><tt class="py-lineno">195</tt> <a class="py-toggle" href="#" id="RDD.cartesian-toggle" onclick="return toggle('RDD.cartesian');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#cartesian">cartesian</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.cartesian-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.cartesian-expanded"><a name="L196"></a><tt class="py-lineno">196</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L197"></a><tt class="py-lineno">197</tt> <tt class="py-line"><tt class="py-docstring"> Return the Cartesian product of this RDD and another one, that is, the</tt> </tt>
+<a name="L198"></a><tt class="py-lineno">198</tt> <tt class="py-line"><tt class="py-docstring"> RDD of all pairs of elements C{(a, b)} where C{a} is in C{self} and</tt> </tt>
+<a name="L199"></a><tt class="py-lineno">199</tt> <tt class="py-line"><tt class="py-docstring"> C{b} is in C{other}.</tt> </tt>
+<a name="L200"></a><tt class="py-lineno">200</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L201"></a><tt class="py-lineno">201</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 2])</tt> </tt>
+<a name="L202"></a><tt class="py-lineno">202</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(rdd.cartesian(rdd).collect())</tt> </tt>
+<a name="L203"></a><tt class="py-lineno">203</tt> <tt class="py-line"><tt class="py-docstring"> [(1, 1), (1, 2), (2, 1), (2, 2)]</tt> </tt>
+<a name="L204"></a><tt class="py-lineno">204</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L205"></a><tt class="py-lineno">205</tt> <tt class="py-line"> <tt class="py-comment"># Due to batching, we can't use the Java cartesian method.</tt> </tt>
+<a name="L206"></a><tt class="py-lineno">206</tt> <tt class="py-line"> <tt class="py-name">java_cartesian</tt> <tt class="py-op">=</tt> <tt id="link-23" class="py-name"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-23', 'RDD', 'link-18');">RDD</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-24" class="py-name" targets="Method pyspark.rdd.RDD.cartesian()=pyspark.rdd.RDD-class.html#cartesian"><a title="pyspark.rdd.RDD.cartesian" class="py-name" href="#" onclick="return doclink('link-24', 'cartesian', 'link-24');">cartesian</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">)</tt> </tt>
+<a name="L207"></a><tt class="py-lineno">207</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">unpack_batches</tt><tt class="py-op">(</tt><tt class="py-param">pair</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L208"></a><tt class="py-lineno">208</tt> <tt class="py-line"> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">y</tt><tt class="py-op">)</tt> <tt class="py-op">=</tt> <tt class="py-name">pair</tt> </tt>
+<a name="L209"></a><tt class="py-lineno">209</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">type</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-name">Batch</tt> <tt class="py-keyword">or</tt> <tt class="py-name">type</tt><tt class="py-op">(</tt><tt class="py-name">y</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-name">Batch</tt><tt class="py-op">:</tt> </tt>
+<a name="L210"></a><tt class="py-lineno">210</tt> <tt class="py-line"> <tt class="py-name">xs</tt> <tt class="py-op">=</tt> <tt class="py-name">x</tt><tt class="py-op">.</tt><tt class="py-name">items</tt> <tt class="py-keyword">if</tt> <tt class="py-name">type</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-name">Batch</tt> <tt class="py-keyword">else</tt> <tt class="py-op">[</tt><tt class="py-name">x</tt><tt class="py-op">]</tt> </tt>
+<a name="L211"></a><tt class="py-lineno">211</tt> <tt class="py-line"> <tt class="py-name">ys</tt> <tt class="py-op">=</tt> <tt class="py-name">y</tt><tt class="py-op">.</tt><tt class="py-name">items</tt> <tt class="py-keyword">if</tt> <tt class="py-name">type</tt><tt class="py-op">(</tt><tt class="py-name">y</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-name">Batch</tt> <tt class="py-keyword">else</tt> <tt class="py-op">[</tt><tt class="py-name">y</tt><tt class="py-op">]</tt> </tt>
+<a name="L212"></a><tt class="py-lineno">212</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">pair</tt> <tt class="py-keyword">in</tt> <tt class="py-name">product</tt><tt class="py-op">(</tt><tt class="py-name">xs</tt><tt class="py-op">,</tt> <tt class="py-name">ys</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L213"></a><tt class="py-lineno">213</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">pair</tt> </tt>
+<a name="L214"></a><tt class="py-lineno">214</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
+<a name="L215"></a><tt class="py-lineno">215</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">pair</tt> </tt>
+</div><a name="L216"></a><tt class="py-lineno">216</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">java_cartesian</tt><tt class="py-op">.</tt><tt id="link-25" class="py-name" targets="Method pyspark.rdd.RDD.flatMap()=pyspark.rdd.RDD-class.html#flatMap"><a title="pyspark.rdd.RDD.flatMap" class="py-name" href="#" onclick="return doclink('link-25', 'flatMap', 'link-25');">flatMap</a></tt><tt class="py-op">(</tt><tt class="py-name">unpack_batches</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L217"></a><tt class="py-lineno">217</tt> <tt class="py-line"> </tt>
+<a name="RDD.groupBy"></a><div id="RDD.groupBy-def"><a name="L218"></a><tt class="py-lineno">218</tt> <a class="py-toggle" href="#" id="RDD.groupBy-toggle" onclick="return toggle('RDD.groupBy');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#groupBy">groupBy</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.groupBy-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.groupBy-expanded"><a name="L219"></a><tt class="py-lineno">219</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L220"></a><tt class="py-lineno">220</tt> <tt class="py-line"><tt class="py-docstring"> Return an RDD of grouped items.</tt> </tt>
+<a name="L221"></a><tt class="py-lineno">221</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L222"></a><tt class="py-lineno">222</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 1, 2, 3, 5, 8])</tt> </tt>
+<a name="L223"></a><tt class="py-lineno">223</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; result = rdd.groupBy(lambda x: x % 2).collect()</tt> </tt>
+<a name="L224"></a><tt class="py-lineno">224</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted([(x, sorted(y)) for (x, y) in result])</tt> </tt>
+<a name="L225"></a><tt class="py-lineno">225</tt> <tt class="py-line"><tt class="py-docstring"> [(0, [2, 8]), (1, [1, 1, 3, 5])]</tt> </tt>
+<a name="L226"></a><tt class="py-lineno">226</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L227"></a><tt class="py-lineno">227</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-26" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-26', 'map', 'link-15');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-27" class="py-name" targets="Method pyspark.rdd.RDD.groupByKey()=pyspark.rdd.RDD-class.html#groupByKey"><a title="pyspark.rdd.RDD.groupByKey" class="py-name" href="#" onclick="return doclink('link-27', 'groupByKey', 'link-27');">groupByKey</a></tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L228"></a><tt class="py-lineno">228</tt> <tt class="py-line"> </tt>
+<a name="RDD.pipe"></a><div id="RDD.pipe-def"><a name="L229"></a><tt class="py-lineno">229</tt> <a class="py-toggle" href="#" id="RDD.pipe-toggle" onclick="return toggle('RDD.pipe');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#pipe">pipe</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">command</tt><tt class="py-op">,</tt> <tt class="py-param">env</tt><tt class="py-op">=</tt><tt class="py-op">{</tt><tt class="py-op">}</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.pipe-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.pipe-expanded"><a name="L230"></a><tt class="py-lineno">230</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L231"></a><tt class="py-lineno">231</tt> <tt class="py-line"><tt class="py-docstring"> Return an RDD created by piping elements to a forked external process.</tt> </tt>
+<a name="L232"></a><tt class="py-lineno">232</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L233"></a><tt class="py-lineno">233</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([1, 2, 3]).pipe('cat').collect()</tt> </tt>
+<a name="L234"></a><tt class="py-lineno">234</tt> <tt class="py-line"><tt class="py-docstring"> ['1', '2', '3']</tt> </tt>
+<a name="L235"></a><tt class="py-lineno">235</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L236"></a><tt class="py-lineno">236</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L237"></a><tt class="py-lineno">237</tt> <tt class="py-line"> <tt id="link-28" class="py-name" targets="Method pyspark.rdd.RDD.pipe()=pyspark.rdd.RDD-class.html#pipe"><a title="pyspark.rdd.RDD.pipe" class="py-name" href="#" onclick="return doclink('link-28', 'pipe', 'link-28');">pipe</a></tt> <tt class="py-op">=</tt> <tt class="py-name">Popen</tt><tt class="py-op">(</tt><tt class="py-name">shlex</tt><tt class="py-op">.</tt><tt class="py-name">split</tt><tt class="py-op">(</tt><tt class="py-name">command</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">env</tt><tt class="py-op">=</tt><tt class="py-name">env</tt><tt class="py-op">,</tt> <tt class="py-name">stdin</tt><tt class="py-op">=</tt><tt class="py-name">PIPE</tt><tt class="py-op">,</tt> <tt class="py-name">stdout</tt><tt class="py-op">=</tt><tt class="py-name">PIPE</tt><tt class="py-op">)</tt> </tt>
+<a name="L238"></a><tt class="py-lineno">238</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">pipe_objs</tt><tt class="py-op">(</tt><tt class="py-param">out</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L239"></a><tt class="py-lineno">239</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">obj</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
+<a name="L240"></a><tt class="py-lineno">240</tt> <tt class="py-line"> <tt class="py-name">out</tt><tt class="py-op">.</tt><tt class="py-name">write</tt><tt class="py-op">(</tt><tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">obj</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">rstrip</tt><tt class="py-op">(</tt><tt class="py-string">'\n'</tt><tt class="py-op">)</tt> <tt class="py-op">+</tt> <tt class="py-string">'\n'</tt><tt class="py-op">)</tt> </tt>
+<a name="L241"></a><tt class="py-lineno">241</tt> <tt class="py-line"> <tt class="py-name">out</tt><tt class="py-op">.</tt><tt class="py-name">close</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L242"></a><tt class="py-lineno">242</tt> <tt class="py-line"> <tt class="py-name">Thread</tt><tt class="py-op">(</tt><tt class="py-name">target</tt><tt class="py-op">=</tt><tt class="py-name">pipe_objs</tt><tt class="py-op">,</tt> <tt class="py-name">args</tt><tt class="py-op">=</tt><tt class="py-op">[</tt><tt id="link-29" class="py-name"><a title="pyspark.rdd.RDD.pipe" class="py-name" href="#" onclick="return doclink('link-29', 'pipe', 'link-28');">pipe</a></tt><tt class="py-op">.</tt><tt class="py-name">stdin</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">start</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L243"></a><tt class="py-lineno">243</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">.</tt><tt class="py-name">rstrip</tt><tt class="py-op">(</tt><tt class="py-string">'\n'</tt><tt class="py-op">)</tt> <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt id="link-30" class="py-name"><a title="pyspark.rdd.RDD.pipe" class="py-name" href="#" onclick="return doclink('link-30', 'pipe', 'link-28');">pipe</a></tt><tt class="py-op">.</tt><tt class="py-name">stdout</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L244"></a><tt class="py-lineno">244</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-31" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-31', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L245"></a><tt class="py-lineno">245</tt> <tt class="py-line"> </tt>
+<a name="RDD.foreach"></a><div id="RDD.foreach-def"><a name="L246"></a><tt class="py-lineno">246</tt> <a class="py-toggle" href="#" id="RDD.foreach-toggle" onclick="return toggle('RDD.foreach');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#foreach">foreach</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.foreach-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.foreach-expanded"><a name="L247"></a><tt class="py-lineno">247</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L248"></a><tt class="py-lineno">248</tt> <tt class="py-line"><tt class="py-docstring"> Applies a function to all elements of this RDD.</tt> </tt>
+<a name="L249"></a><tt class="py-lineno">249</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L250"></a><tt class="py-lineno">250</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; def f(x): print x</tt> </tt>
+<a name="L251"></a><tt class="py-lineno">251</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([1, 2, 3, 4, 5]).foreach(f)</tt> </tt>
+<a name="L252"></a><tt class="py-lineno">252</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L253"></a><tt class="py-lineno">253</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-32" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-32', 'map', 'link-15');">map</a></tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-33" class="py-name" targets="Method pyspark.rdd.RDD.collect()=pyspark.rdd.RDD-class.html#collect"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-33', 'collect', 'link-33');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> <tt class="py-comment"># Force evaluation</tt> </tt>
+</div><a name="L254"></a><tt class="py-lineno">254</tt> <tt class="py-line"> </tt>
+<a name="RDD.collect"></a><div id="RDD.collect-def"><a name="L255"></a><tt class="py-lineno">255</tt> <a class="py-toggle" href="#" id="RDD.collect-toggle" onclick="return toggle('RDD.collect');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#collect">collect</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.collect-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.collect-expanded"><a name="L256"></a><tt class="py-lineno">256</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L257"></a><tt class="py-lineno">257</tt> <tt class="py-line"><tt class="py-docstring"> Return a list that contains all of the elements in this RDD.</tt> </tt>
+<a name="L258"></a><tt class="py-lineno">258</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L259"></a><tt class="py-lineno">259</tt> <tt class="py-line"> <tt class="py-name">picklesInJava</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-34" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-34', 'collect', 'link-33');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">iterator</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L260"></a><tt class="py-lineno">260</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">list</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_collect_iterator_through_file</tt><tt class="py-op">(</tt><tt class="py-name">picklesInJava</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L261"></a><tt class="py-lineno">261</tt> <tt class="py-line"> </tt>
+<a name="RDD._collect_iterator_through_file"></a><div id="RDD._collect_iterator_through_file-def"><a name="L262"></a><tt class="py-lineno">262</tt> <a class="py-toggle" href="#" id="RDD._collect_iterator_through_file-toggle" onclick="return toggle('RDD._collect_iterator_through_file');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#_collect_iterator_through_file">_collect_iterator_through_file</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD._collect_iterator_through_file-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD._collect_iterator_through_file-expanded"><a name="L263"></a><tt class="py-lineno">263</tt> <tt class="py-line"> <tt class="py-comment"># Transferring lots of data through Py4J can be slow because</tt> </tt>
+<a name="L264"></a><tt class="py-lineno">264</tt> <tt class="py-line"> <tt class="py-comment"># socket.readline() is inefficient. Instead, we'll dump the data to a</tt> </tt>
+<a name="L265"></a><tt class="py-lineno">265</tt> <tt class="py-line"> <tt class="py-comment"># file and read it back.</tt> </tt>
+<a name="L266"></a><tt class="py-lineno">266</tt> <tt class="py-line"> <tt class="py-name">tempFile</tt> <tt class="py-op">=</tt> <tt class="py-name">NamedTemporaryFile</tt><tt class="py-op">(</tt><tt class="py-name">delete</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">,</tt> <tt class="py-name">dir</tt><tt class="py-op">=</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_temp_dir</tt><tt class="py-op">)</tt> </tt>
+<a name="L267"></a><tt class="py-lineno">267</tt> <tt class="py-line"> <tt class="py-name">tempFile</tt><tt class="py-op">.</tt><tt class="py-name">close</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L268"></a><tt class="py-lineno">268</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-35" class="py-name" targets="Variable pyspark.context.SparkContext._writeIteratorToPickleFile=pyspark.context.SparkContext-class.html#_writeIteratorToPickleFile"><a title="pyspark.context.SparkContext._writeIteratorToPickleFile" class="py-name" href="#" onclick="return doclink('link-35', '_writeIteratorToPickleFile', 'link-35');">_writeIteratorToPickleFile</a></tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">,</tt> <tt class="py-name">tempFile</tt><tt class="py-op">.</tt><tt class="py-name">name</tt><tt class="py-op">)</tt> </tt>
+<a name="L269"></a><tt class="py-lineno">269</tt> <tt class="py-line"> <tt class="py-comment"># Read the data into Python and deserialize it:</tt> </tt>
+<a name="L270"></a><tt class="py-lineno">270</tt> <tt class="py-line"> <tt class="py-keyword">with</tt> <tt class="py-name">open</tt><tt class="py-op">(</tt><tt class="py-name">tempFile</tt><tt class="py-op">.</tt><tt class="py-name">name</tt><tt class="py-op">,</tt> <tt class="py-string">'rb'</tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">tempFile</tt><tt class="py-op">:</tt> </tt>
+<a name="L271"></a><tt class="py-lineno">271</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">item</tt> <tt class="py-keyword">in</tt> <tt class="py-name">read_from_pickle_file</tt><tt class="py-op">(</tt><tt class="py-name">tempFile</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L272"></a><tt class="py-lineno">272</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">item</tt> </tt>
+<a name="L273"></a><tt class="py-lineno">273</tt> <tt class="py-line"> <tt class="py-name">os</tt><tt class="py-op">.</tt><tt class="py-name">unlink</tt><tt class="py-op">(</tt><tt class="py-name">tempFile</tt><tt class="py-op">.</tt><tt class="py-name">name</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L274"></a><tt class="py-lineno">274</tt> <tt class="py-line"> </tt>
+<a name="RDD.reduce"></a><div id="RDD.reduce-def"><a name="L275"></a><tt class="py-lineno">275</tt> <a class="py-toggle" href="#" id="RDD.reduce-toggle" onclick="return toggle('RDD.reduce');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#reduce">reduce</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.reduce-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.reduce-expanded"><a name="L276"></a><tt class="py-lineno">276</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L277"></a><tt class="py-lineno">277</tt> <tt class="py-line"><tt class="py-docstring"> Reduces the elements of this RDD using the specified commutative and</tt> </tt>
+<a name="L278"></a><tt class="py-lineno">278</tt> <tt class="py-line"><tt class="py-docstring"> associative binary operator.</tt> </tt>
+<a name="L279"></a><tt class="py-lineno">279</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L280"></a><tt class="py-lineno">280</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; from operator import add</tt> </tt>
+<a name="L281"></a><tt class="py-lineno">281</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([1, 2, 3, 4, 5]).reduce(add)</tt> </tt>
+<a name="L282"></a><tt class="py-lineno">282</tt> <tt class="py-line"><tt class="py-docstring"> 15</tt> </tt>
+<a name="L283"></a><tt class="py-lineno">283</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add)</tt> </tt>
+<a name="L284"></a><tt class="py-lineno">284</tt> <tt class="py-line"><tt class="py-docstring"> 10</tt> </tt>
+<a name="L285"></a><tt class="py-lineno">285</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L286"></a><tt class="py-lineno">286</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L287"></a><tt class="py-lineno">287</tt> <tt class="py-line"> <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">None</tt> </tt>
+<a name="L288"></a><tt class="py-lineno">288</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">obj</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
+<a name="L289"></a><tt class="py-lineno">289</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">acc</tt> <tt class="py-keyword">is</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
+<a name="L290"></a><tt class="py-lineno">290</tt> <tt class="py-line"> <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">obj</tt> </tt>
+<a name="L291"></a><tt class="py-lineno">291</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
+<a name="L292"></a><tt class="py-lineno">292</tt> <tt class="py-line"> <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">obj</tt><tt class="py-op">,</tt> <tt class="py-name">acc</tt><tt class="py-op">)</tt> </tt>
+<a name="L293"></a><tt class="py-lineno">293</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">acc</tt> <tt class="py-keyword">is</tt> <tt class="py-keyword">not</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
+<a name="L294"></a><tt class="py-lineno">294</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">acc</tt> </tt>
+</div><a name="L295"></a><tt class="py-lineno">295</tt> <tt class="py-line"> <tt class="py-name">vals</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-36" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-36', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-37" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-37', 'collect', 'link-33');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L296"></a><tt class="py-lineno">296</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt id="link-38" class="py-name" targets="Method pyspark.rdd.RDD.reduce()=pyspark.rdd.RDD-class.html#reduce"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-38', 'reduce', 'link-38');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">vals</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L297"></a><tt class="py-lineno">297</tt> <tt class="py-line"> </tt>
+<a name="RDD.fold"></a><div id="RDD.fold-def"><a name="L298"></a><tt class="py-lineno">298</tt> <a class="py-toggle" href="#" id="RDD.fold-toggle" onclick="return toggle('RDD.fold');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#fold">fold</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">zeroValue</tt><tt class="py-op">,</tt> <tt class="py-param">op</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.fold-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.fold-expanded"><a name="L299"></a><tt class="py-lineno">299</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L300"></a><tt class="py-lineno">300</tt> <tt class="py-line"><tt class="py-docstring"> Aggregate the elements of each partition, and then the results for all</tt> </tt>
+<a name="L301"></a><tt class="py-lineno">301</tt> <tt class="py-line"><tt class="py-docstring"> the partitions, using a given associative function and a neutral "zero</tt> </tt>
+<a name="L302"></a><tt class="py-lineno">302</tt> <tt class="py-line"><tt class="py-docstring"> value."</tt> </tt>
+<a name="L303"></a><tt class="py-lineno">303</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L304"></a><tt class="py-lineno">304</tt> <tt class="py-line"><tt class="py-docstring"> The function C{op(t1, t2)} is allowed to modify C{t1} and return it</tt> </tt>
+<a name="L305"></a><tt class="py-lineno">305</tt> <tt class="py-line"><tt class="py-docstring"> as its result value to avoid object allocation; however, it should not</tt> </tt>
+<a name="L306"></a><tt class="py-lineno">306</tt> <tt class="py-line"><tt class="py-docstring"> modify C{t2}.</tt> </tt>
+<a name="L307"></a><tt class="py-lineno">307</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L308"></a><tt class="py-lineno">308</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; from operator import add</tt> </tt>
+<a name="L309"></a><tt class="py-lineno">309</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)</tt> </tt>
+<a name="L310"></a><tt class="py-lineno">310</tt> <tt class="py-line"><tt class="py-docstring"> 15</tt> </tt>
+<a name="L311"></a><tt class="py-lineno">311</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L312"></a><tt class="py-lineno">312</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L313"></a><tt class="py-lineno">313</tt> <tt class="py-line"> <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">zeroValue</tt> </tt>
+<a name="L314"></a><tt class="py-lineno">314</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">obj</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
+<a name="L315"></a><tt class="py-lineno">315</tt> <tt class="py-line"> <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">op</tt><tt class="py-op">(</tt><tt class="py-name">obj</tt><tt class="py-op">,</tt> <tt class="py-name">acc</tt><tt class="py-op">)</tt> </tt>
+<a name="L316"></a><tt class="py-lineno">316</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">acc</tt> </tt>
+</div><a name="L317"></a><tt class="py-lineno">317</tt> <tt class="py-line"> <tt class="py-name">vals</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-39" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-39', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-40" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-40', 'collect', 'link-33');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L318"></a><tt class="py-lineno">318</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt id="link-41" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-41', 'reduce', 'link-38');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">op</tt><tt class="py-op">,</tt> <tt class="py-name">vals</tt><tt class="py-op">,</tt> <tt class="py-name">zeroValue</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L319"></a><tt class="py-lineno">319</tt> <tt class="py-line"> </tt>
+<a name="L320"></a><tt class="py-lineno">320</tt> <tt class="py-line"> <tt class="py-comment"># TODO: aggregate</tt> </tt>
+<a name="L321"></a><tt class="py-lineno">321</tt> <tt class="py-line"> </tt>
+<a name="RDD.sum"></a><div id="RDD.sum-def"><a name="L322"></a><tt class="py-lineno">322</tt> <a class="py-toggle" href="#" id="RDD.sum-toggle" onclick="return toggle('RDD.sum');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#sum">sum</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.sum-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.sum-expanded"><a name="L323"></a><tt class="py-lineno">323</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L324"></a><tt class="py-lineno">324</tt> <tt class="py-line"><tt class="py-docstring"> Add up the elements in this RDD.</tt> </tt>
+<a name="L325"></a><tt class="py-lineno">325</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L326"></a><tt class="py-lineno">326</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([1.0, 2.0, 3.0]).sum()</tt> </tt>
+<a name="L327"></a><tt class="py-lineno">327</tt> <tt class="py-line"><tt class="py-docstring"> 6.0</tt> </tt>
+<a name="L328"></a><tt class="py-lineno">328</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L329"></a><tt class="py-lineno">329</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-42" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-42', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">[</tt><tt id="link-43" class="py-name" targets="Method pyspark.rdd.RDD.sum()=pyspark.rdd.RDD-class.html#sum"><a title="pyspark.rdd.RDD.sum" class="py-name" href="#" onclick="return doclink('link-43', 'sum', 'link-43');">sum</a></tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-44" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-44', 'reduce', 'link-38');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">operator</tt><tt class="py-op">.</tt><tt class="py-name">add</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L330"></a><tt class="py-lineno">330</tt> <tt class="py-line"> </tt>
+<a name="RDD.count"></a><div id="RDD.count-def"><a name="L331"></a><tt class="py-lineno">331</tt> <a class="py-toggle" href="#" id="RDD.count-toggle" onclick="return toggle('RDD.count');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#count">count</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.count-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.count-expanded"><a name="L332"></a><tt class="py-lineno">332</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L333"></a><tt class="py-lineno">333</tt> <tt class="py-line"><tt class="py-docstring"> Return the number of elements in this RDD.</tt> </tt>
+<a name="L334"></a><tt class="py-lineno">334</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L335"></a><tt class="py-lineno">335</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([2, 3, 4]).count()</tt> </tt>
+<a name="L336"></a><tt class="py-lineno">336</tt> <tt class="py-line"><tt class="py-docstring"> 3</tt> </tt>
+<a name="L337"></a><tt class="py-lineno">337</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L338"></a><tt class="py-lineno">338</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-45" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-45', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">i</tt><tt class="py-op">:</tt> <tt class="py-op">[</tt><tt id="link-46" class="py-name"><a title="pyspark.rdd.RDD.sum" class="py-name" href="#" onclick="return doclink('link-46', 'sum', 'link-43');">sum</a></tt><tt class="py-op">(</tt><tt class="py-number">1</tt> <tt class="py-keyword">for</tt> <tt class="py-name">_</tt> <tt class="py-keyword">in</tt> <tt class="py-name">i</tt><tt class="py-op">)</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-47" class="py-name"><a title="pyspark.rdd.RDD.sum" class="py-name" href="#" onclick="return doclink('link-47', 'sum', 'link-43');">sum</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L339"></a><tt class="py-lineno">339</tt> <tt class="py-line"> </tt>
+<a name="RDD.countByValue"></a><div id="RDD.countByValue-def"><a name="L340"></a><tt class="py-lineno">340</tt> <a class="py-toggle" href="#" id="RDD.countByValue-toggle" onclick="return toggle('RDD.countByValue');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#countByValue">countByValue</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.countByValue-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.countByValue-expanded"><a name="L341"></a><tt class="py-lineno">341</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L342"></a><tt class="py-lineno">342</tt> <tt class="py-line"><tt class="py-docstring"> Return the count of each unique value in this RDD as a dictionary of</tt> </tt>
+<a name="L343"></a><tt class="py-lineno">343</tt> <tt class="py-line"><tt class="py-docstring"> (value, count) pairs.</tt> </tt>
+<a name="L344"></a><tt class="py-lineno">344</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L345"></a><tt class="py-lineno">345</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())</tt> </tt>
+<a name="L346"></a><tt class="py-lineno">346</tt> <tt class="py-line"><tt class="py-docstring"> [(1, 2), (2, 3)]</tt> </tt>
+<a name="L347"></a><tt class="py-lineno">347</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L348"></a><tt class="py-lineno">348</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">countPartition</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L349"></a><tt class="py-lineno">349</tt> <tt class="py-line"> <tt class="py-name">counts</tt> <tt class="py-op">=</tt> <tt class="py-name">defaultdict</tt><tt class="py-op">(</tt><tt class="py-name">int</tt><tt class="py-op">)</tt> </tt>
+<a name="L350"></a><tt class="py-lineno">350</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">obj</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
+<a name="L351"></a><tt class="py-lineno">351</tt> <tt class="py-line"> <tt class="py-name">counts</tt><tt class="py-op">[</tt><tt class="py-name">obj</tt><tt class="py-op">]</tt> <tt class="py-op">+=</tt> <tt class="py-number">1</tt> </tt>
+<a name="L352"></a><tt class="py-lineno">352</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">counts</tt> </tt>
+</div><a name="L353"></a><tt class="py-lineno">353</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">mergeMaps</tt><tt class="py-op">(</tt><tt class="py-param">m1</tt><tt class="py-op">,</tt> <tt class="py-param">m2</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L354"></a><tt class="py-lineno">354</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">m2</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L355"></a><tt class="py-lineno">355</tt> <tt class="py-line"> <tt class="py-name">m1</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">+=</tt> <tt class="py-name">v</tt> </tt>
+<a name="L356"></a><tt class="py-lineno">356</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">m1</tt> </tt>
+</div><a name="L357"></a><tt class="py-lineno">357</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-48" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-48', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">countPartition</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-49" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-49', 'reduce', 'link-38');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">mergeMaps</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L358"></a><tt class="py-lineno">358</tt> <tt class="py-line"> </tt>
+<a name="RDD.take"></a><div id="RDD.take-def"><a name="L359"></a><tt class="py-lineno">359</tt> <a class="py-toggle" href="#" id="RDD.take-toggle" onclick="return toggle('RDD.take');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#take">take</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">num</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.take-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.take-expanded"><a name="L360"></a><tt class="py-lineno">360</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L361"></a><tt class="py-lineno">361</tt> <tt class="py-line"><tt class="py-docstring"> Take the first num elements of the RDD.</tt> </tt>
+<a name="L362"></a><tt class="py-lineno">362</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L363"></a><tt class="py-lineno">363</tt> <tt class="py-line"><tt class="py-docstring"> This currently scans the partitions *one by one*, so it will be slow if</tt> </tt>
+<a name="L364"></a><tt class="py-lineno">364</tt> <tt class="py-line"><tt class="py-docstring"> a lot of partitions are required. In that case, use L{collect} to get</tt> </tt>
+<a name="L365"></a><tt class="py-lineno">365</tt> <tt class="py-line"><tt class="py-docstring"> the whole RDD instead.</tt> </tt>
+<a name="L366"></a><tt class="py-lineno">366</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L367"></a><tt class="py-lineno">367</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)</tt> </tt>
+<a name="L368"></a><tt class="py-lineno">368</tt> <tt class="py-line"><tt class="py-docstring"> [2, 3]</tt> </tt>
+<a name="L369"></a><tt class="py-lineno">369</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([2, 3, 4, 5, 6]).take(10)</tt> </tt>
+<a name="L370"></a><tt class="py-lineno">370</tt> <tt class="py-line"><tt class="py-docstring"> [2, 3, 4, 5, 6]</tt> </tt>
+<a name="L371"></a><tt class="py-lineno">371</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L372"></a><tt class="py-lineno">372</tt> <tt class="py-line"> <tt class="py-name">items</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-op">]</tt> </tt>
+<a name="L373"></a><tt class="py-lineno">373</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-name">partition</tt> <tt class="py-keyword">in</tt> <tt class="py-name">range</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt class="py-name">splits</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">size</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L374"></a><tt class="py-lineno">374</tt> <tt class="py-line"> <tt class="py-name">iterator</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-50" class="py-name" targets="Variable pyspark.context.SparkContext._takePartition=pyspark.context.SparkContext-class.html#_takePartition"><a title="pyspark.context.SparkContext._takePartition" class="py-name" href="#" onclick="return doclink('link-50', '_takePartition', 'link-50');">_takePartition</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-51" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-51', 'rdd', 'link-5');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">partition</tt><tt class="py-op">)</tt> </tt>
+<a name="L375"></a><tt class="py-lineno">375</tt> <tt class="py-line"> <tt class="py-comment"># Each item in the iterator is a string, Python object, batch of</tt> </tt>
+<a name="L376"></a><tt class="py-lineno">376</tt> <tt class="py-line"> <tt class="py-comment"># Python objects. Regardless, it is sufficient to take `num`</tt> </tt>
+<a name="L377"></a><tt class="py-lineno">377</tt> <tt class="py-line"> <tt class="py-comment"># of these objects in order to collect `num` Python objects:</tt> </tt>
+<a name="L378"></a><tt class="py-lineno">378</tt> <tt class="py-line"> <tt class="py-name">iterator</tt> <tt class="py-op">=</tt> <tt class="py-name">iterator</tt><tt class="py-op">.</tt><tt id="link-52" class="py-name" targets="Method pyspark.rdd.RDD.take()=pyspark.rdd.RDD-class.html#take"><a title="pyspark.rdd.RDD.take" class="py-name" href="#" onclick="return doclink('link-52', 'take', 'link-52');">take</a></tt><tt class="py-op">(</tt><tt class="py-name">num</tt><tt class="py-op">)</tt> </tt>
+<a name="L379"></a><tt class="py-lineno">379</tt> <tt class="py-line"> <tt class="py-name">items</tt><tt class="py-op">.</tt><tt class="py-name">extend</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_collect_iterator_through_file</tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+<a name="L380"></a><tt class="py-lineno">380</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">items</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;=</tt> <tt class="py-name">num</tt><tt class="py-op">:</tt> </tt>
+<a name="L381"></a><tt class="py-lineno">381</tt> <tt class="py-line"> <tt class="py-keyword">break</tt> </tt>
+<a name="L382"></a><tt class="py-lineno">382</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">items</tt><tt class="py-op">[</tt><tt class="py-op">:</tt><tt class="py-name">num</tt><tt class="py-op">]</tt> </tt>
+</div><a name="L383"></a><tt class="py-lineno">383</tt> <tt class="py-line"> </tt>
+<a name="RDD.first"></a><div id="RDD.first-def"><a name="L384"></a><tt class="py-lineno">384</tt> <a class="py-toggle" href="#" id="RDD.first-toggle" onclick="return toggle('RDD.first');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#first">first</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.first-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.first-expanded"><a name="L385"></a><tt class="py-lineno">385</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L386"></a><tt class="py-lineno">386</tt> <tt class="py-line"><tt class="py-docstring"> Return the first element in this RDD.</tt> </tt>
+<a name="L387"></a><tt class="py-lineno">387</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L388"></a><tt class="py-lineno">388</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize([2, 3, 4]).first()</tt> </tt>
+<a name="L389"></a><tt class="py-lineno">389</tt> <tt class="py-line"><tt class="py-docstring"> 2</tt> </tt>
+<a name="L390"></a><tt class="py-lineno">390</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L391"></a><tt class="py-lineno">391</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-53" class="py-name"><a title="pyspark.rdd.RDD.take" class="py-name" href="#" onclick="return doclink('link-53', 'take', 'link-52');">take</a></tt><tt class="py-op">(</tt><tt class="py-number">1</tt><tt class="py-op">)</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt> </tt>
+</div><a name="L392"></a><tt class="py-lineno">392</tt> <tt class="py-line"> </tt>
+<a name="RDD.saveAsTextFile"></a><div id="RDD.saveAsTextFile-def"><a name="L393"></a><tt class="py-lineno">393</tt> <a class="py-toggle" href="#" id="RDD.saveAsTextFile-toggle" onclick="return toggle('RDD.saveAsTextFile');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#saveAsTextFile">saveAsTextFile</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">path</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.saveAsTextFile-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.saveAsTextFile-expanded"><a name="L394"></a><tt class="py-lineno">394</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L395"></a><tt class="py-lineno">395</tt> <tt class="py-line"><tt class="py-docstring"> Save this RDD as a text file, using string representations of elements.</tt> </tt>
+<a name="L396"></a><tt class="py-lineno">396</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L397"></a><tt class="py-lineno">397</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; tempFile = NamedTemporaryFile(delete=True)</tt> </tt>
+<a name="L398"></a><tt class="py-lineno">398</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; tempFile.close()</tt> </tt>
+<a name="L399"></a><tt class="py-lineno">399</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sc.parallelize(range(10)).saveAsTextFile(tempFile.name)</tt> </tt>
+<a name="L400"></a><tt class="py-lineno">400</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; from fileinput import input</tt> </tt>
+<a name="L401"></a><tt class="py-lineno">401</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; from glob import glob</tt> </tt>
+<a name="L402"></a><tt class="py-lineno">402</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; ''.join(input(glob(tempFile.name + "/part-0000*")))</tt> </tt>
+<a name="L403"></a><tt class="py-lineno">403</tt> <tt class="py-line"><tt class="py-docstring"> '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'</tt> </tt>
+<a name="L404"></a><tt class="py-lineno">404</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L405"></a><tt class="py-lineno">405</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L406"></a><tt class="py-lineno">406</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-op">(</tt><tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">encode</tt><tt class="py-op">(</tt><tt class="py-string">"utf-8"</tt><tt class="py-op">)</tt> <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L407"></a><tt class="py-lineno">407</tt> <tt class="py-line"> <tt class="py-name">keyed</tt> <tt class="py-op">=</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
+<a name="L408"></a><tt class="py-lineno">408</tt> <tt class="py-line"> <tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
+<a name="L409"></a><tt class="py-lineno">409</tt> <tt class="py-line"> <tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-54" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-54', 'map', 'link-15');">map</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-55" class="py-name" targets="Variable pyspark.context.SparkContext._jvm=pyspark.context.SparkContext-class.html#_jvm"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-55', '_jvm', 'link-55');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">BytesToString</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-56" class="py-name" targets="Method pyspark.rdd.RDD.saveAsTextFile()=pyspark.rdd.RDD-class.html#saveAsTextFile"><a title="pyspark.rdd.RDD.saveAsTextFile" class="py-name" href="#" onclick="return doclink('link-56', 'saveAsTextFile', 'link-56');">saveAsTextFile</a></tt><tt class="py-op">(</tt><tt class="py-name">path</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L410"></a><tt class="py-lineno">410</tt> <tt class="py-line"> </tt>
+<a name="L411"></a><tt class="py-lineno">411</tt> <tt class="py-line"> <tt class="py-comment"># Pair functions</tt> </tt>
+<a name="L412"></a><tt class="py-lineno">412</tt> <tt class="py-line"> </tt>
+<a name="RDD.collectAsMap"></a><div id="RDD.collectAsMap-def"><a name="L413"></a><tt class="py-lineno">413</tt> <a class="py-toggle" href="#" id="RDD.collectAsMap-toggle" onclick="return toggle('RDD.collectAsMap');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#collectAsMap">collectAsMap</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.collectAsMap-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.collectAsMap-expanded"><a name="L414"></a><tt class="py-lineno">414</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L415"></a><tt class="py-lineno">415</tt> <tt class="py-line"><tt class="py-docstring"> Return the key-value pairs in this RDD to the master as a dictionary.</tt> </tt>
+<a name="L416"></a><tt class="py-lineno">416</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L417"></a><tt class="py-lineno">417</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()</tt> </tt>
+<a name="L418"></a><tt class="py-lineno">418</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; m[1]</tt> </tt>
+<a name="L419"></a><tt class="py-lineno">419</tt> <tt class="py-line"><tt class="py-docstring"> 2</tt> </tt>
+<a name="L420"></a><tt class="py-lineno">420</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; m[3]</tt> </tt>
+<a name="L421"></a><tt class="py-lineno">421</tt> <tt class="py-line"><tt class="py-docstring"> 4</tt> </tt>
+<a name="L422"></a><tt class="py-lineno">422</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L423"></a><tt class="py-lineno">423</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">dict</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-57" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-57', 'collect', 'link-33');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L424"></a><tt class="py-lineno">424</tt> <tt class="py-line"> </tt>
+<a name="RDD.reduceByKey"></a><div id="RDD.reduceByKey-def"><a name="L425"></a><tt class="py-lineno">425</tt> <a class="py-toggle" href="#" id="RDD.reduceByKey-toggle" onclick="return toggle('RDD.reduceByKey');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#reduceByKey">reduceByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">func</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.reduceByKey-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.reduceByKey-expanded"><a name="L426"></a><tt class="py-lineno">426</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L427"></a><tt class="py-lineno">427</tt> <tt class="py-line"><tt class="py-docstring"> Merge the values for each key using an associative reduce function.</tt> </tt>
+<a name="L428"></a><tt class="py-lineno">428</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L429"></a><tt class="py-lineno">429</tt> <tt class="py-line"><tt class="py-docstring"> This will also perform the merging locally on each mapper before</tt> </tt>
+<a name="L430"></a><tt class="py-lineno">430</tt> <tt class="py-line"><tt class="py-docstring"> sending results to a reducer, similarly to a "combiner" in MapReduce.</tt> </tt>
+<a name="L431"></a><tt class="py-lineno">431</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L432"></a><tt class="py-lineno">432</tt> <tt class="py-line"><tt class="py-docstring"> Output will be hash-partitioned with C{numPartitions} partitions, or</tt> </tt>
+<a name="L433"></a><tt class="py-lineno">433</tt> <tt class="py-line"><tt class="py-docstring"> the default parallelism level if C{numPartitions} is not specified.</tt> </tt>
+<a name="L434"></a><tt class="py-lineno">434</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L435"></a><tt class="py-lineno">435</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; from operator import add</tt> </tt>
+<a name="L436"></a><tt class="py-lineno">436</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
+<a name="L437"></a><tt class="py-lineno">437</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(rdd.reduceByKey(add).collect())</tt> </tt>
+<a name="L438"></a><tt class="py-lineno">438</tt> <tt class="py-line"><tt class="py-docstring"> [('a', 2), ('b', 1)]</tt> </tt>
+<a name="L439"></a><tt class="py-lineno">439</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L440"></a><tt class="py-lineno">440</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-58" class="py-name" targets="Method pyspark.rdd.RDD.combineByKey()=pyspark.rdd.RDD-class.html#combineByKey"><a title="pyspark.rdd.RDD.combineByKey" class="py-name" href="#" onclick="return doclink('link-58', 'combineByKey', 'link-58');">combineByKey</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L441"></a><tt class="py-lineno">441</tt> <tt class="py-line"> </tt>
+<a name="RDD.reduceByKeyLocally"></a><div id="RDD.reduceByKeyLocally-def"><a name="L442"></a><tt class="py-lineno">442</tt> <a class="py-toggle" href="#" id="RDD.reduceByKeyLocally-toggle" onclick="return toggle('RDD.reduceByKeyLocally');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#reduceByKeyLocally">reduceByKeyLocally</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">func</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.reduceByKeyLocally-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.reduceByKeyLocally-expanded"><a name="L443"></a><tt class="py-lineno">443</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L444"></a><tt class="py-lineno">444</tt> <tt class="py-line"><tt class="py-docstring"> Merge the values for each key using an associative reduce function, but</tt> </tt>
+<a name="L445"></a><tt class="py-lineno">445</tt> <tt class="py-line"><tt class="py-docstring"> return the results immediately to the master as a dictionary.</tt> </tt>
+<a name="L446"></a><tt class="py-lineno">446</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L447"></a><tt class="py-lineno">447</tt> <tt class="py-line"><tt class="py-docstring"> This will also perform the merging locally on each mapper before</tt> </tt>
+<a name="L448"></a><tt class="py-lineno">448</tt> <tt class="py-line"><tt class="py-docstring"> sending results to a reducer, similarly to a "combiner" in MapReduce.</tt> </tt>
+<a name="L449"></a><tt class="py-lineno">449</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L450"></a><tt class="py-lineno">450</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; from operator import add</tt> </tt>
+<a name="L451"></a><tt class="py-lineno">451</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
+<a name="L452"></a><tt class="py-lineno">452</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(rdd.reduceByKeyLocally(add).items())</tt> </tt>
+<a name="L453"></a><tt class="py-lineno">453</tt> <tt class="py-line"><tt class="py-docstring"> [('a', 2), ('b', 1)]</tt> </tt>
+<a name="L454"></a><tt class="py-lineno">454</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L455"></a><tt class="py-lineno">455</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">reducePartition</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L456"></a><tt class="py-lineno">456</tt> <tt class="py-line"> <tt class="py-name">m</tt> <tt class="py-op">=</tt> <tt class="py-op">{</tt><tt class="py-op">}</tt> </tt>
+<a name="L457"></a><tt class="py-lineno">457</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
+<a name="L458"></a><tt class="py-lineno">458</tt> <tt class="py-line"> <tt class="py-name">m</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">v</tt> <tt class="py-keyword">if</tt> <tt class="py-name">k</tt> <tt class="py-keyword">not</tt> <tt class="py-keyword">in</tt> <tt class="py-name">m</tt> <tt class="py-keyword">else</tt> <tt class="py-name">func</tt><tt class="py-op">(</tt><tt class="py-name">m</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
+<a name="L459"></a><tt class="py-lineno">459</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">m</tt> </tt>
+</div><a name="L460"></a><tt class="py-lineno">460</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">mergeMaps</tt><tt class="py-op">(</tt><tt class="py-param">m1</tt><tt class="py-op">,</tt> <tt class="py-param">m2</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L461"></a><tt class="py-lineno">461</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">m2</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L462"></a><tt class="py-lineno">462</tt> <tt class="py-line"> <tt class="py-name">m1</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">v</tt> <tt class="py-keyword">if</tt> <tt class="py-name">k</tt> <tt class="py-keyword">not</tt> <tt class="py-keyword">in</tt> <tt class="py-name">m1</tt> <tt class="py-keyword">else</tt> <tt class="py-name">func</tt><tt class="py-op">(</tt><tt class="py-name">m1</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
+<a name="L463"></a><tt class="py-lineno">463</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">m1</tt> </tt>
+</div><a name="L464"></a><tt class="py-lineno">464</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-59" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-59', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">reducePartition</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-60" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-60', 'reduce', 'link-38');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">mergeMaps</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L465"></a><tt class="py-lineno">465</tt> <tt class="py-line"> </tt>
+<a name="RDD.countByKey"></a><div id="RDD.countByKey-def"><a name="L466"></a><tt class="py-lineno">466</tt> <a class="py-toggle" href="#" id="RDD.countByKey-toggle" onclick="return toggle('RDD.countByKey');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#countByKey">countByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.countByKey-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.countByKey-expanded"><a name="L467"></a><tt class="py-lineno">467</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L468"></a><tt class="py-lineno">468</tt> <tt class="py-line"><tt class="py-docstring"> Count the number of elements for each key, and return the result to the</tt> </tt>
+<a name="L469"></a><tt class="py-lineno">469</tt> <tt class="py-line"><tt class="py-docstring"> master as a dictionary.</tt> </tt>
+<a name="L470"></a><tt class="py-lineno">470</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L471"></a><tt class="py-lineno">471</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
+<a name="L472"></a><tt class="py-lineno">472</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(rdd.countByKey().items())</tt> </tt>
+<a name="L473"></a><tt class="py-lineno">473</tt> <tt class="py-line"><tt class="py-docstring"> [('a', 2), ('b', 1)]</tt> </tt>
+<a name="L474"></a><tt class="py-lineno">474</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L475"></a><tt class="py-lineno">475</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-61" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-61', 'map', 'link-15');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-62" class="py-name" targets="Method pyspark.rdd.RDD.countByValue()=pyspark.rdd.RDD-class.html#countByValue"><a title="pyspark.rdd.RDD.countByValue" class="py-name" href="#" onclick="return doclink('link-62', 'countByValue', 'link-62');">countByValue</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L476"></a><tt class="py-lineno">476</tt> <tt class="py-line"> </tt>
+<a name="RDD.join"></a><div id="RDD.join-def"><a name="L477"></a><tt class="py-lineno">477</tt> <a class="py-toggle" href="#" id="RDD.join-toggle" onclick="return toggle('RDD.join');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#join">join</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.join-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.join-expanded"><a name="L478"></a><tt class="py-lineno">478</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L479"></a><tt class="py-lineno">479</tt> <tt class="py-line"><tt class="py-docstring"> Return an RDD containing all pairs of elements with matching keys in</tt> </tt>
+<a name="L480"></a><tt class="py-lineno">480</tt> <tt class="py-line"><tt class="py-docstring"> C{self} and C{other}.</tt> </tt>
+<a name="L481"></a><tt class="py-lineno">481</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L482"></a><tt class="py-lineno">482</tt> <tt class="py-line"><tt class="py-docstring"> Each pair of elements will be returned as a (k, (v1, v2)) tuple, where</tt> </tt>
+<a name="L483"></a><tt class="py-lineno">483</tt> <tt class="py-line"><tt class="py-docstring"> (k, v1) is in C{self} and (k, v2) is in C{other}.</tt> </tt>
+<a name="L484"></a><tt class="py-lineno">484</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L485"></a><tt class="py-lineno">485</tt> <tt class="py-line"><tt class="py-docstring"> Performs a hash join across the cluster.</tt> </tt>
+<a name="L486"></a><tt class="py-lineno">486</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L487"></a><tt class="py-lineno">487</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4)])</tt> </tt>
+<a name="L488"></a><tt class="py-lineno">488</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; y = sc.parallelize([("a", 2), ("a", 3)])</tt> </tt>
+<a name="L489"></a><tt class="py-lineno">489</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(x.join(y).collect())</tt> </tt>
+<a name="L490"></a><tt class="py-lineno">490</tt> <tt class="py-line"><tt class="py-docstring"> [('a', (1, 2)), ('a', (1, 3))]</tt> </tt>
+<a name="L491"></a><tt class="py-lineno">491</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L492"></a><tt class="py-lineno">492</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">python_join</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L493"></a><tt class="py-lineno">493</tt> <tt class="py-line"> </tt>
+<a name="RDD.leftOuterJoin"></a><div id="RDD.leftOuterJoin-def"><a name="L494"></a><tt class="py-lineno">494</tt> <a class="py-toggle" href="#" id="RDD.leftOuterJoin-toggle" onclick="return toggle('RDD.leftOuterJoin');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#leftOuterJoin">leftOuterJoin</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.leftOuterJoin-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.leftOuterJoin-expanded"><a name="L495"></a><tt class="py-lineno">495</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L496"></a><tt class="py-lineno">496</tt> <tt class="py-line"><tt class="py-docstring"> Perform a left outer join of C{self} and C{other}.</tt> </tt>
+<a name="L497"></a><tt class="py-lineno">497</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L498"></a><tt class="py-lineno">498</tt> <tt class="py-line"><tt class="py-docstring"> For each element (k, v) in C{self}, the resulting RDD will either</tt> </tt>
+<a name="L499"></a><tt class="py-lineno">499</tt> <tt class="py-line"><tt class="py-docstring"> contain all pairs (k, (v, w)) for w in C{other}, or the pair</tt> </tt>
+<a name="L500"></a><tt class="py-lineno">500</tt> <tt class="py-line"><tt class="py-docstring"> (k, (v, None)) if no elements in other have key k.</tt> </tt>
+<a name="L501"></a><tt class="py-lineno">501</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L502"></a><tt class="py-lineno">502</tt> <tt class="py-line"><tt class="py-docstring"> Hash-partitions the resulting RDD into the given number of partitions.</tt> </tt>
+<a name="L503"></a><tt class="py-lineno">503</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L504"></a><tt class="py-lineno">504</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4)])</tt> </tt>
+<a name="L505"></a><tt class="py-lineno">505</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; y = sc.parallelize([("a", 2)])</tt> </tt>
+<a name="L506"></a><tt class="py-lineno">506</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(x.leftOuterJoin(y).collect())</tt> </tt>
+<a name="L507"></a><tt class="py-lineno">507</tt> <tt class="py-line"><tt class="py-docstring"> [('a', (1, 2)), ('b', (4, None))]</tt> </tt>
+<a name="L508"></a><tt class="py-lineno">508</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L509"></a><tt class="py-lineno">509</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">python_left_outer_join</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L510"></a><tt class="py-lineno">510</tt> <tt class="py-line"> </tt>
+<a name="RDD.rightOuterJoin"></a><div id="RDD.rightOuterJoin-def"><a name="L511"></a><tt class="py-lineno">511</tt> <a class="py-toggle" href="#" id="RDD.rightOuterJoin-toggle" onclick="return toggle('RDD.rightOuterJoin');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#rightOuterJoin">rightOuterJoin</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.rightOuterJoin-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.rightOuterJoin-expanded"><a name="L512"></a><tt class="py-lineno">512</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L513"></a><tt class="py-lineno">513</tt> <tt class="py-line"><tt class="py-docstring"> Perform a right outer join of C{self} and C{other}.</tt> </tt>
+<a name="L514"></a><tt class="py-lineno">514</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L515"></a><tt class="py-lineno">515</tt> <tt class="py-line"><tt class="py-docstring"> For each element (k, w) in C{other}, the resulting RDD will either</tt> </tt>
+<a name="L516"></a><tt class="py-lineno">516</tt> <tt class="py-line"><tt class="py-docstring"> contain all pairs (k, (v, w)) for v in this, or the pair (k, (None, w))</tt> </tt>
+<a name="L517"></a><tt class="py-lineno">517</tt> <tt class="py-line"><tt class="py-docstring"> if no elements in C{self} have key k.</tt> </tt>
+<a name="L518"></a><tt class="py-lineno">518</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L519"></a><tt class="py-lineno">519</tt> <tt class="py-line"><tt class="py-docstring"> Hash-partitions the resulting RDD into the given number of partitions.</tt> </tt>
+<a name="L520"></a><tt class="py-lineno">520</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L521"></a><tt class="py-lineno">521</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4)])</tt> </tt>
+<a name="L522"></a><tt class="py-lineno">522</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; y = sc.parallelize([("a", 2)])</tt> </tt>
+<a name="L523"></a><tt class="py-lineno">523</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(y.rightOuterJoin(x).collect())</tt> </tt>
+<a name="L524"></a><tt class="py-lineno">524</tt> <tt class="py-line"><tt class="py-docstring"> [('a', (2, 1)), ('b', (None, 4))]</tt> </tt>
+<a name="L525"></a><tt class="py-lineno">525</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L526"></a><tt class="py-lineno">526</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">python_right_outer_join</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L527"></a><tt class="py-lineno">527</tt> <tt class="py-line"> </tt>
+<a name="L528"></a><tt class="py-lineno">528</tt> <tt class="py-line"> <tt class="py-comment"># TODO: add option to control map-side combining</tt> </tt>
+<a name="RDD.partitionBy"></a><div id="RDD.partitionBy-def"><a name="L529"></a><tt class="py-lineno">529</tt> <a class="py-toggle" href="#" id="RDD.partitionBy-toggle" onclick="return toggle('RDD.partitionBy');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#partitionBy">partitionBy</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">,</tt> <tt class="py-param">partitionFunc</tt><tt class="py-op">=</tt><tt class="py-name">hash</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.partitionBy-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.partitionBy-expanded"><a name="L530"></a><tt class="py-lineno">530</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L531"></a><tt class="py-lineno">531</tt> <tt class="py-line"><tt class="py-docstring"> Return a copy of the RDD partitioned using the specified partitioner.</tt> </tt>
+<a name="L532"></a><tt class="py-lineno">532</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L533"></a><tt class="py-lineno">533</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))</tt> </tt>
+<a name="L534"></a><tt class="py-lineno">534</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sets = pairs.partitionBy(2).glom().collect()</tt> </tt>
+<a name="L535"></a><tt class="py-lineno">535</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; set(sets[0]).intersection(set(sets[1]))</tt> </tt>
+<a name="L536"></a><tt class="py-lineno">536</tt> <tt class="py-line"><tt class="py-docstring"> set([])</tt> </tt>
+<a name="L537"></a><tt class="py-lineno">537</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L538"></a><tt class="py-lineno">538</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">numPartitions</tt> <tt class="py-keyword">is</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
+<a name="L539"></a><tt class="py-lineno">539</tt> <tt class="py-line"> <tt class="py-name">numPartitions</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-63" class="py-name" targets="Method pyspark.context.SparkContext.defaultParallelism()=pyspark.context.SparkContext-class.html#defaultParallelism"><a title="pyspark.context.SparkContext.defaultParallelism" class="py-name" href="#" onclick="return doclink('link-63', 'defaultParallelism', 'link-63');">defaultParallelism</a></tt> </tt>
+<a name="L540"></a><tt class="py-lineno">540</tt> <tt class="py-line"> <tt class="py-comment"># Transferring O(n) objects to Java is too expensive. Instead, we'll</tt> </tt>
+<a name="L541"></a><tt class="py-lineno">541</tt> <tt class="py-line"> <tt class="py-comment"># form the hash buckets in Python, transferring O(numPartitions) objects</tt> </tt>
+<a name="L542"></a><tt class="py-lineno">542</tt> <tt class="py-line"> <tt class="py-comment"># to Java. Each object is a (splitNumber, [objects]) pair.</tt> </tt>
+<a name="L543"></a><tt class="py-lineno">543</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">add_shuffle_key</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L544"></a><tt class="py-lineno">544</tt> <tt class="py-line"> <tt class="py-name">buckets</tt> <tt class="py-op">=</tt> <tt class="py-name">defaultdict</tt><tt class="py-op">(</tt><tt class="py-name">list</tt><tt class="py-op">)</tt> </tt>
+<a name="L545"></a><tt class="py-lineno">545</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
+<a name="L546"></a><tt class="py-lineno">546</tt> <tt class="py-line"> <tt class="py-name">buckets</tt><tt class="py-op">[</tt><tt class="py-name">partitionFunc</tt><tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">)</tt> <tt class="py-op">%</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+<a name="L547"></a><tt class="py-lineno">547</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">,</tt> <tt class="py-name">items</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">buckets</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L548"></a><tt class="py-lineno">548</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">)</tt> </tt>
+<a name="L549"></a><tt class="py-lineno">549</tt> <tt class="py-line"> <tt class="py-keyword">yield</tt> <tt class="py-name">dump_pickle</tt><tt class="py-op">(</tt><tt class="py-name">Batch</tt><tt class="py-op">(</tt><tt class="py-name">items</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L550"></a><tt class="py-lineno">550</tt> <tt class="py-line"> <tt class="py-name">keyed</tt> <tt class="py-op">=</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">add_shuffle_key</tt><tt class="py-op">)</tt> </tt>
+<a name="L551"></a><tt class="py-lineno">551</tt> <tt class="py-line"> <tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
+<a name="L552"></a><tt class="py-lineno">552</tt> <tt class="py-line"> <tt class="py-name">pairRDD</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-64" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-64', '_jvm', 'link-55');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">PairwiseRDD</tt><tt class="py-op">(</tt><tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-65" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-65', 'rdd', 'link-5');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">asJavaPairRDD</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L553"></a><tt class="py-lineno">553</tt> <tt class="py-line"> <tt class="py-name">partitioner</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-66" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-66', '_jvm', 'link-55');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">PythonPartitioner</tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">,</tt> </tt>
+<a name="L554"></a><tt class="py-lineno">554</tt> <tt class="py-line"> <tt class="py-name">id</tt><tt class="py-op">(</tt><tt class="py-name">partitionFunc</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+<a name="L555"></a><tt class="py-lineno">555</tt> <tt class="py-line"> <tt class="py-name">jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">pairRDD</tt><tt class="py-op">.</tt><tt id="link-67" class="py-name" targets="Method pyspark.rdd.RDD.partitionBy()=pyspark.rdd.RDD-class.html#partitionBy"><a title="pyspark.rdd.RDD.partitionBy" class="py-name" href="#" onclick="return doclink('link-67', 'partitionBy', 'link-67');">partitionBy</a></tt><tt class="py-op">(</tt><tt class="py-name">partitioner</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">values</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L556"></a><tt class="py-lineno">556</tt> <tt class="py-line"> <tt id="link-68" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-68', 'rdd', 'link-5');">rdd</a></tt> <tt class="py-op">=</tt> <tt id="link-69" class="py-name"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-69', 'RDD', 'link-18');">RDD</a></tt><tt class="py-op">(</tt><tt class="py-name">jrdd</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">)</tt> </tt>
+<a name="L557"></a><tt class="py-lineno">557</tt> <tt class="py-line"> <tt class="py-comment"># This is required so that id(partitionFunc) remains unique, even if</tt> </tt>
+<a name="L558"></a><tt class="py-lineno">558</tt> <tt class="py-line"> <tt class="py-comment"># partitionFunc is a lambda:</tt> </tt>
+<a name="L559"></a><tt class="py-lineno">559</tt> <tt class="py-line"> <tt id="link-70" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-70', 'rdd', 'link-5');">rdd</a></tt><tt class="py-op">.</tt><tt class="py-name">_partitionFunc</tt> <tt class="py-op">=</tt> <tt class="py-name">partitionFunc</tt> </tt>
+<a name="L560"></a><tt class="py-lineno">560</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt id="link-71" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-71', 'rdd', 'link-5');">rdd</a></tt> </tt>
+</div><a name="L561"></a><tt class="py-lineno">561</tt> <tt class="py-line"> </tt>
+<a name="L562"></a><tt class="py-lineno">562</tt> <tt class="py-line"> <tt class="py-comment"># TODO: add control over map-side aggregation</tt> </tt>
+<a name="RDD.combineByKey"></a><div id="RDD.combineByKey-def"><a name="L563"></a><tt class="py-lineno">563</tt> <a class="py-toggle" href="#" id="RDD.combineByKey-toggle" onclick="return toggle('RDD.combineByKey');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#combineByKey">combineByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">createCombiner</tt><tt class="py-op">,</tt> <tt class="py-param">mergeValue</tt><tt class="py-op">,</tt> <tt class="py-param">mergeCombiners</tt><tt class="py-op">,</tt> </tt>
+<a name="L564"></a><tt class="py-lineno">564</tt> <tt class="py-line"> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.combineByKey-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.combineByKey-expanded"><a name="L565"></a><tt class="py-lineno">565</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L566"></a><tt class="py-lineno">566</tt> <tt class="py-line"><tt class="py-docstring"> Generic function to combine the elements for each key using a custom</tt> </tt>
+<a name="L567"></a><tt class="py-lineno">567</tt> <tt class="py-line"><tt class="py-docstring"> set of aggregation functions.</tt> </tt>
+<a name="L568"></a><tt class="py-lineno">568</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L569"></a><tt class="py-lineno">569</tt> <tt class="py-line"><tt class="py-docstring"> Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined</tt> </tt>
+<a name="L570"></a><tt class="py-lineno">570</tt> <tt class="py-line"><tt class="py-docstring"> type" C. Note that V and C can be different -- for example, one might</tt> </tt>
+<a name="L571"></a><tt class="py-lineno">571</tt> <tt class="py-line"><tt class="py-docstring"> group an RDD of type (Int, Int) into an RDD of type (Int, List[Int]).</tt> </tt>
+<a name="L572"></a><tt class="py-lineno">572</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L573"></a><tt class="py-lineno">573</tt> <tt class="py-line"><tt class="py-docstring"> Users provide three functions:</tt> </tt>
+<a name="L574"></a><tt class="py-lineno">574</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L575"></a><tt class="py-lineno">575</tt> <tt class="py-line"><tt class="py-docstring"> - C{createCombiner}, which turns a V into a C (e.g., creates</tt> </tt>
+<a name="L576"></a><tt class="py-lineno">576</tt> <tt class="py-line"><tt class="py-docstring"> a one-element list)</tt> </tt>
+<a name="L577"></a><tt class="py-lineno">577</tt> <tt class="py-line"><tt class="py-docstring"> - C{mergeValue}, to merge a V into a C (e.g., adds it to the end of</tt> </tt>
+<a name="L578"></a><tt class="py-lineno">578</tt> <tt class="py-line"><tt class="py-docstring"> a list)</tt> </tt>
+<a name="L579"></a><tt class="py-lineno">579</tt> <tt class="py-line"><tt class="py-docstring"> - C{mergeCombiners}, to combine two C's into a single one.</tt> </tt>
+<a name="L580"></a><tt class="py-lineno">580</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L581"></a><tt class="py-lineno">581</tt> <tt class="py-line"><tt class="py-docstring"> In addition, users can control the partitioning of the output RDD.</tt> </tt>
+<a name="L582"></a><tt class="py-lineno">582</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L583"></a><tt class="py-lineno">583</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
+<a name="L584"></a><tt class="py-lineno">584</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; def f(x): return x</tt> </tt>
+<a name="L585"></a><tt class="py-lineno">585</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; def add(a, b): return a + str(b)</tt> </tt>
+<a name="L586"></a><tt class="py-lineno">586</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(x.combineByKey(str, add, add).collect())</tt> </tt>
+<a name="L587"></a><tt class="py-lineno">587</tt> <tt class="py-line"><tt class="py-docstring"> [('a', '11'), ('b', '1')]</tt> </tt>
+<a name="L588"></a><tt class="py-lineno">588</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L589"></a><tt class="py-lineno">589</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">numPartitions</tt> <tt class="py-keyword">is</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
+<a name="L590"></a><tt class="py-lineno">590</tt> <tt class="py-line"> <tt class="py-name">numPartitions</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-72" class="py-name"><a title="pyspark.context.SparkContext.defaultParallelism" class="py-name" href="#" onclick="return doclink('link-72', 'defaultParallelism', 'link-63');">defaultParallelism</a></tt> </tt>
+<a name="L591"></a><tt class="py-lineno">591</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">combineLocally</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L592"></a><tt class="py-lineno">592</tt> <tt class="py-line"> <tt class="py-name">combiners</tt> <tt class="py-op">=</tt> <tt class="py-op">{</tt><tt class="py-op">}</tt> </tt>
+<a name="L593"></a><tt class="py-lineno">593</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
+<a name="L594"></a><tt class="py-lineno">594</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">k</tt> <tt class="py-keyword">not</tt> <tt class="py-keyword">in</tt> <tt class="py-name">combiners</tt><tt class="py-op">:</tt> </tt>
+<a name="L595"></a><tt class="py-lineno">595</tt> <tt class="py-line"> <tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">createCombiner</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
+<a name="L596"></a><tt class="py-lineno">596</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
+<a name="L597"></a><tt class="py-lineno">597</tt> <tt class="py-line"> <tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">mergeValue</tt><tt class="py-op">(</tt><tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
+<a name="L598"></a><tt class="py-lineno">598</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">combiners</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L599"></a><tt class="py-lineno">599</tt> <tt class="py-line"> <tt class="py-name">locally_combined</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-73" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-73', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">combineLocally</tt><tt class="py-op">)</tt> </tt>
+<a name="L600"></a><tt class="py-lineno">600</tt> <tt class="py-line"> <tt class="py-name">shuffled</tt> <tt class="py-op">=</tt> <tt class="py-name">locally_combined</tt><tt class="py-op">.</tt><tt id="link-74" class="py-name"><a title="pyspark.rdd.RDD.partitionBy" class="py-name" href="#" onclick="return doclink('link-74', 'partitionBy', 'link-67');">partitionBy</a></tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
+<a name="L601"></a><tt class="py-lineno">601</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">_mergeCombiners</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L602"></a><tt class="py-lineno">602</tt> <tt class="py-line"> <tt class="py-name">combiners</tt> <tt class="py-op">=</tt> <tt class="py-op">{</tt><tt class="py-op">}</tt> </tt>
+<a name="L603"></a><tt class="py-lineno">603</tt> <tt class="py-line"> <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
+<a name="L604"></a><tt class="py-lineno">604</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">k</tt> <tt class="py-keyword">in</tt> <tt class="py-name">combiners</tt><tt class="py-op">:</tt> </tt>
+<a name="L605"></a><tt class="py-lineno">605</tt> <tt class="py-line"> <tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">v</tt> </tt>
+<a name="L606"></a><tt class="py-lineno">606</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
+<a name="L607"></a><tt class="py-lineno">607</tt> <tt class="py-line"> <tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">mergeCombiners</tt><tt class="py-op">(</tt><tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
+<a name="L608"></a><tt class="py-lineno">608</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">combiners</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L609"></a><tt class="py-lineno">609</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">shuffled</tt><tt class="py-op">.</tt><tt id="link-75" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-75', 'mapPartitions', 'link-14');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">_mergeCombiners</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L610"></a><tt class="py-lineno">610</tt> <tt class="py-line"> </tt>
+<a name="L611"></a><tt class="py-lineno">611</tt> <tt class="py-line"> <tt class="py-comment"># TODO: support variant with custom partitioner</tt> </tt>
+<a name="RDD.groupByKey"></a><div id="RDD.groupByKey-def"><a name="L612"></a><tt class="py-lineno">612</tt> <a class="py-toggle" href="#" id="RDD.groupByKey-toggle" onclick="return toggle('RDD.groupByKey');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#groupByKey">groupByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.groupByKey-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.groupByKey-expanded"><a name="L613"></a><tt class="py-lineno">613</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L614"></a><tt class="py-lineno">614</tt> <tt class="py-line"><tt class="py-docstring"> Group the values for each key in the RDD into a single sequence.</tt> </tt>
+<a name="L615"></a><tt class="py-lineno">615</tt> <tt class="py-line"><tt class="py-docstring"> Hash-partitions the resulting RDD with into numPartitions partitions.</tt> </tt>
+<a name="L616"></a><tt class="py-lineno">616</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L617"></a><tt class="py-lineno">617</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
+<a name="L618"></a><tt class="py-lineno">618</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(x.groupByKey().collect())</tt> </tt>
+<a name="L619"></a><tt class="py-lineno">619</tt> <tt class="py-line"><tt class="py-docstring"> [('a', [1, 1]), ('b', [1])]</tt> </tt>
+<a name="L620"></a><tt class="py-lineno">620</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L621"></a><tt class="py-lineno">621</tt> <tt class="py-line"> </tt>
+<a name="L622"></a><tt class="py-lineno">622</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">createCombiner</tt><tt class="py-op">(</tt><tt class="py-param">x</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L623"></a><tt class="py-lineno">623</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-op">[</tt><tt class="py-name">x</tt><tt class="py-op">]</tt> </tt>
+</div><a name="L624"></a><tt class="py-lineno">624</tt> <tt class="py-line"> </tt>
+<a name="L625"></a><tt class="py-lineno">625</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">mergeValue</tt><tt class="py-op">(</tt><tt class="py-param">xs</tt><tt class="py-op">,</tt> <tt class="py-param">x</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L626"></a><tt class="py-lineno">626</tt> <tt class="py-line"> <tt class="py-name">xs</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt> </tt>
+<a name="L627"></a><tt class="py-lineno">627</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">xs</tt> </tt>
+</div><a name="L628"></a><tt class="py-lineno">628</tt> <tt class="py-line"> </tt>
+<a name="L629"></a><tt class="py-lineno">629</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">mergeCombiners</tt><tt class="py-op">(</tt><tt class="py-param">a</tt><tt class="py-op">,</tt> <tt class="py-param">b</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L630"></a><tt class="py-lineno">630</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">a</tt> <tt class="py-op">+</tt> <tt class="py-name">b</tt> </tt>
+</div><a name="L631"></a><tt class="py-lineno">631</tt> <tt class="py-line"> </tt>
+<a name="L632"></a><tt class="py-lineno">632</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-76" class="py-name"><a title="pyspark.rdd.RDD.combineByKey" class="py-name" href="#" onclick="return doclink('link-76', 'combineByKey', 'link-58');">combineByKey</a></tt><tt class="py-op">(</tt><tt class="py-name">createCombiner</tt><tt class="py-op">,</tt> <tt class="py-name">mergeValue</tt><tt class="py-op">,</tt> <tt class="py-name">mergeCombiners</tt><tt class="py-op">,</tt> </tt>
+<a name="L633"></a><tt class="py-lineno">633</tt> <tt class="py-line"> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L634"></a><tt class="py-lineno">634</tt> <tt class="py-line"> </tt>
+<a name="L635"></a><tt class="py-lineno">635</tt> <tt class="py-line"> <tt class="py-comment"># TODO: add tests</tt> </tt>
+<a name="RDD.flatMapValues"></a><div id="RDD.flatMapValues-def"><a name="L636"></a><tt class="py-lineno">636</tt> <a class="py-toggle" href="#" id="RDD.flatMapValues-toggle" onclick="return toggle('RDD.flatMapValues');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#flatMapValues">flatMapValues</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.flatMapValues-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.flatMapValues-expanded"><a name="L637"></a><tt class="py-lineno">637</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L638"></a><tt class="py-lineno">638</tt> <tt class="py-line"><tt class="py-docstring"> Pass each value in the key-value pair RDD through a flatMap function</tt> </tt>
+<a name="L639"></a><tt class="py-lineno">639</tt> <tt class="py-line"><tt class="py-docstring"> without changing the keys; this also retains the original RDD's</tt> </tt>
+<a name="L640"></a><tt class="py-lineno">640</tt> <tt class="py-line"><tt class="py-docstring"> partitioning.</tt> </tt>
+<a name="L641"></a><tt class="py-lineno">641</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L642"></a><tt class="py-lineno">642</tt> <tt class="py-line"> <tt class="py-name">flat_map_fn</tt> <tt class="py-op">=</tt> <tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt> <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+<a name="L643"></a><tt class="py-lineno">643</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-77" class="py-name"><a title="pyspark.rdd.RDD.flatMap" class="py-name" href="#" onclick="return doclink('link-77', 'flatMap', 'link-25');">flatMap</a></tt><tt class="py-op">(</tt><tt class="py-name">flat_map_fn</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L644"></a><tt class="py-lineno">644</tt> <tt class="py-line"> </tt>
+<a name="RDD.mapValues"></a><div id="RDD.mapValues-def"><a name="L645"></a><tt class="py-lineno">645</tt> <a class="py-toggle" href="#" id="RDD.mapValues-toggle" onclick="return toggle('RDD.mapValues');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#mapValues">mapValues</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.mapValues-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.mapValues-expanded"><a name="L646"></a><tt class="py-lineno">646</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L647"></a><tt class="py-lineno">647</tt> <tt class="py-line"><tt class="py-docstring"> Pass each value in the key-value pair RDD through a map function</tt> </tt>
+<a name="L648"></a><tt class="py-lineno">648</tt> <tt class="py-line"><tt class="py-docstring"> without changing the keys; this also retains the original RDD's</tt> </tt>
+<a name="L649"></a><tt class="py-lineno">649</tt> <tt class="py-line"><tt class="py-docstring"> partitioning.</tt> </tt>
+<a name="L650"></a><tt class="py-lineno">650</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L651"></a><tt class="py-lineno">651</tt> <tt class="py-line"> <tt class="py-name">map_values_fn</tt> <tt class="py-op">=</tt> <tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+<a name="L652"></a><tt class="py-lineno">652</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-78" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-78', 'map', 'link-15');">map</a></tt><tt class="py-op">(</tt><tt class="py-name">map_values_fn</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L653"></a><tt class="py-lineno">653</tt> <tt class="py-line"> </tt>
+<a name="L654"></a><tt class="py-lineno">654</tt> <tt class="py-line"> <tt class="py-comment"># TODO: support varargs cogroup of several RDDs.</tt> </tt>
+<a name="RDD.groupWith"></a><div id="RDD.groupWith-def"><a name="L655"></a><tt class="py-lineno">655</tt> <a class="py-toggle" href="#" id="RDD.groupWith-toggle" onclick="return toggle('RDD.groupWith');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#groupWith">groupWith</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.groupWith-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.groupWith-expanded"><a name="L656"></a><tt class="py-lineno">656</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L657"></a><tt class="py-lineno">657</tt> <tt class="py-line"><tt class="py-docstring"> Alias for cogroup.</tt> </tt>
+<a name="L658"></a><tt class="py-lineno">658</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L659"></a><tt class="py-lineno">659</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-79" class="py-name" targets="Method pyspark.rdd.RDD.cogroup()=pyspark.rdd.RDD-class.html#cogroup"><a title="pyspark.rdd.RDD.cogroup" class="py-name" href="#" onclick="return doclink('link-79', 'cogroup', 'link-79');">cogroup</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L660"></a><tt class="py-lineno">660</tt> <tt class="py-line"> </tt>
+<a name="L661"></a><tt class="py-lineno">661</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: add variant with custom partitioner</tt> </tt>
+<a name="RDD.cogroup"></a><div id="RDD.cogroup-def"><a name="L662"></a><tt class="py-lineno">662</tt> <a class="py-toggle" href="#" id="RDD.cogroup-toggle" onclick="return toggle('RDD.cogroup');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#cogroup">cogroup</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="RDD.cogroup-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="RDD.cogroup-expanded"><a name="L663"></a><tt class="py-lineno">663</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L664"></a><tt class="py-lineno">664</tt> <tt class="py-line"><tt class="py-docstring"> For each key k in C{self} or C{other}, return a resulting RDD that</tt> </tt>
+<a name="L665"></a><tt class="py-lineno">665</tt> <tt class="py-line"><tt class="py-docstring"> contains a tuple with the list of values for that key in C{self} as well</tt> </tt>
+<a name="L666"></a><tt class="py-lineno">666</tt> <tt class="py-line"><tt class="py-docstring"> as C{other}.</tt> </tt>
+<a name="L667"></a><tt class="py-lineno">667</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L668"></a><tt class="py-lineno">668</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4)])</tt> </tt>
+<a name="L669"></a><tt class="py-lineno">669</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; y = sc.parallelize([("a", 2)])</tt> </tt>
+<a name="L670"></a><tt class="py-lineno">670</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; sorted(x.cogroup(y).collect())</tt> </tt>
+<a name="L671"></a><tt class="py-lineno">671</tt> <tt class="py-line"><tt class="py-docstring"> [('a', ([1], [2])), ('b', ([4], []))]</tt> </tt>
+<a name="L672"></a><tt class="py-lineno">672</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="L673"></a><tt class="py-lineno">673</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">python_cogroup</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
+</div></div><a name="L674"></a><tt class="py-lineno">674</tt> <tt class="py-line"> </tt>
+<a name="PipelinedRDD"></a><div id="PipelinedRDD-def"><a name="L675"></a><tt class="py-lineno">675</tt> <tt class="py-line"> <tt class="py-comment"># TODO: `lookup` is disabled because we can't make direct comparisons based</tt> </tt>
+<a name="L676"></a><tt class="py-lineno">676</tt> <tt class="py-line"> <tt class="py-comment"># on the key; we need to compare the hash of the key to the hash of the</tt> </tt>
+<a name="L677"></a><tt class="py-lineno">677</tt> <tt class="py-line"> <tt class="py-comment"># keys in the pairs. This could be an expensive operation, since those</tt> </tt>
+<a name="L678"></a><tt class="py-lineno">678</tt> <tt class="py-line"> <tt class="py-comment"># hashes aren't retained.</tt> </tt>
+<a name="L679"></a><tt class="py-lineno">679</tt> <tt class="py-line"> </tt>
+<a name="L680"></a><tt class="py-lineno">680</tt> <tt class="py-line"> </tt>
+<a name="L681"></a><tt class="py-lineno">681</tt> <a class="py-toggle" href="#" id="PipelinedRDD-toggle" onclick="return toggle('PipelinedRDD');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.rdd.PipelinedRDD-class.html">PipelinedRDD</a><tt class="py-op">(</tt><tt class="py-base-class">RDD</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="PipelinedRDD-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="PipelinedRDD-expanded"><a name="L682"></a><tt class="py-lineno">682</tt> <tt class="py-line"> <tt class="py-docstring">"""</tt> </tt>
+<a name="L683"></a><tt class="py-lineno">683</tt> <tt class="py-line"><tt class="py-docstring"> Pipelined maps:</tt> </tt>
+<a name="L684"></a><tt class="py-lineno">684</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4])</tt> </tt>
+<a name="L685"></a><tt class="py-lineno">685</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()</tt> </tt>
+<a name="L686"></a><tt class="py-lineno">686</tt> <tt class="py-line"><tt class="py-docstring"> [4, 8, 12, 16]</tt> </tt>
+<a name="L687"></a><tt class="py-lineno">687</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect()</tt> </tt>
+<a name="L688"></a><tt class="py-lineno">688</tt> <tt class="py-line"><tt class="py-docstring"> [4, 8, 12, 16]</tt> </tt>
+<a name="L689"></a><tt class="py-lineno">689</tt> <tt class="py-line"><tt class="py-docstring"></tt> </tt>
+<a name="L690"></a><tt class="py-lineno">690</tt> <tt class="py-line"><tt class="py-docstring"> Pipelined reduces:</tt> </tt>
+<a name="L691"></a><tt class="py-lineno">691</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; from operator import add</tt> </tt>
+<a name="L692"></a><tt class="py-lineno">692</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd.map(lambda x: 2 * x).reduce(add)</tt> </tt>
+<a name="L693"></a><tt class="py-lineno">693</tt> <tt class="py-line"><tt class="py-docstring"> 20</tt> </tt>
+<a name="L694"></a><tt class="py-lineno">694</tt> <tt class="py-line"><tt class="py-docstring"> &gt;&gt;&gt; rdd.flatMap(lambda x: [x, x]).reduce(add)</tt> </tt>
+<a name="L695"></a><tt class="py-lineno">695</tt> <tt class="py-line"><tt class="py-docstring"> 20</tt> </tt>
+<a name="L696"></a><tt class="py-lineno">696</tt> <tt class="py-line"><tt class="py-docstring"> """</tt> </tt>
+<a name="PipelinedRDD.__init__"></a><div id="PipelinedRDD.__init__-def"><a name="L697"></a><tt class="py-lineno">697</tt> <a class="py-toggle" href="#" id="PipelinedRDD.__init__-toggle" onclick="return toggle('PipelinedRDD.__init__');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.PipelinedRDD-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">prev</tt><tt class="py-op">,</tt> <tt class="py-param">func</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="PipelinedRDD.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="PipelinedRDD.__init__-expanded"><a name="L698"></a><tt class="py-lineno">698</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">isinstance</tt><tt class="py-op">(</tt><tt class="py-name">prev</tt><tt class="py-op">,</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">)</tt> <tt class="py-keyword">and</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">_is_pipelinable</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L699"></a><tt class="py-lineno">699</tt> <tt class="py-line"> <tt class="py-name">prev_func</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">func</tt> </tt>
+<a name="L700"></a><tt class="py-lineno">700</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">pipeline_func</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L701"></a><tt class="py-lineno">701</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">func</tt><tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">,</tt> <tt class="py-name">prev_func</tt><tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L702"></a><tt class="py-lineno">702</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">func</tt> <tt class="py-op">=</tt> <tt class="py-name">pipeline_func</tt> </tt>
+<a name="L703"></a><tt class="py-lineno">703</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">preservesPartitioning</tt> <tt class="py-op">=</tt> \ </tt>
+<a name="L704"></a><tt class="py-lineno">704</tt> <tt class="py-line"> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">preservesPartitioning</tt> <tt class="py-keyword">and</tt> <tt class="py-name">preservesPartitioning</tt> </tt>
+<a name="L705"></a><tt class="py-lineno">705</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt> </tt>
+<a name="L706"></a><tt class="py-lineno">706</tt> <tt class="py-line"> <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
+<a name="L707"></a><tt class="py-lineno">707</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">func</tt> <tt class="py-op">=</tt> <tt class="py-name">func</tt> </tt>
+<a name="L708"></a><tt class="py-lineno">708</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">preservesPartitioning</tt> <tt class="py-op">=</tt> <tt class="py-name">preservesPartitioning</tt> </tt>
+<a name="L709"></a><tt class="py-lineno">709</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt> </tt>
+<a name="L710"></a><tt class="py-lineno">710</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
+<a name="L711"></a><tt class="py-lineno">711</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
+<a name="L712"></a><tt class="py-lineno">712</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> </tt>
+<a name="L713"></a><tt class="py-lineno">713</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">prev</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt> </tt>
+<a name="L714"></a><tt class="py-lineno">714</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt> <tt class="py-op">=</tt> <tt class="py-name">None</tt> </tt>
+<a name="L715"></a><tt class="py-lineno">715</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
+</div><a name="L716"></a><tt class="py-lineno">716</tt> <tt class="py-line"> </tt>
+<a name="L717"></a><tt class="py-lineno">717</tt> <tt class="py-line"> <tt class="py-decorator">@</tt><tt class="py-decorator">property</tt> </tt>
+<a name="PipelinedRDD._jrdd"></a><div id="PipelinedRDD._jrdd-def"><a name="L718"></a><tt class="py-lineno">718</tt> <a class="py-toggle" href="#" id="PipelinedRDD._jrdd-toggle" onclick="return toggle('PipelinedRDD._jrdd');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.PipelinedRDD-class.html#_jrdd">_jrdd</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="PipelinedRDD._jrdd-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="PipelinedRDD._jrdd-expanded"><a name="L719"></a><tt class="py-lineno">719</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt><tt class="py-op">:</tt> </tt>
+<a name="L720"></a><tt class="py-lineno">720</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt> </tt>
+<a name="L721"></a><tt class="py-lineno">721</tt> <tt class="py-line"> <tt class="py-name">func</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">func</tt> </tt>
+<a name="L722"></a><tt class="py-lineno">722</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt> <tt class="py-keyword">and</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">batchSize</tt> <tt class="py-op">!=</tt> <tt class="py-number">1</tt><tt class="py-op">:</tt> </tt>
+<a name="L723"></a><tt class="py-lineno">723</tt> <tt class="py-line"> <tt class="py-name">oldfunc</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">func</tt> </tt>
+<a name="L724"></a><tt class="py-lineno">724</tt> <tt class="py-line"> <tt class="py-name">batchSize</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">batchSize</tt> </tt>
+<a name="L725"></a><tt class="py-lineno">725</tt> <tt class="py-line"> <tt class="py-keyword">def</tt> <tt class="py-def-name">batched_func</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+<a name="L726"></a><tt class="py-lineno">726</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">batched</tt><tt class="py-op">(</tt><tt class="py-name">oldfunc</tt><tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">batchSize</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L727"></a><tt class="py-lineno">727</tt> <tt class="py-line"> <tt class="py-name">func</tt> <tt class="py-op">=</tt> <tt class="py-name">batched_func</tt> </tt>
+<a name="L728"></a><tt class="py-lineno">728</tt> <tt class="py-line"> <tt class="py-name">cmds</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt><tt class="py-op">]</tt> </tt>
+<a name="L729"></a><tt class="py-lineno">729</tt> <tt class="py-line"> <tt class="py-name">pipe_command</tt> <tt class="py-op">=</tt> <tt class="py-string">' '</tt><tt class="py-op">.</tt><tt id="link-80" class="py-name"><a title="pyspark.rdd.RDD.join" class="py-name" href="#" onclick="return doclink('link-80', 'join', 'link-3');">join</a></tt><tt class="py-op">(</tt><tt class="py-name">b64enc</tt><tt class="py-op">(</tt><tt class="py-name">cloudpickle</tt><tt class="py-op">.</tt><tt class="py-name">dumps</tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> <tt class="py-keyword">for</tt> <tt class="py-name">f</tt> <tt class="py-keyword">in</tt> <tt class="py-name">cmds</tt><tt class="py-op">)</tt> </tt>
+<a name="L730"></a><tt class="py-lineno">730</tt> <tt class="py-line"> <tt class="py-name">broadcast_vars</tt> <tt class="py-op">=</tt> <tt class="py-name">ListConverter</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">convert</tt><tt class="py-op">(</tt> </tt>
+<a name="L731"></a><tt class="py-lineno">731</tt> <tt class="py-line"> <tt class="py-op">[</tt><tt class="py-name">x</tt><tt class="py-op">.</tt><tt class="py-name">_jbroadcast</tt> <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_pickled_broadcast_vars</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> </tt>
+<a name="L732"></a><tt class="py-lineno">732</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-81" class="py-name" targets="Variable pyspark.context.SparkContext._gateway=pyspark.context.SparkContext-class.html#_gateway"><a title="pyspark.context.SparkContext._gateway" class="py-name" href="#" onclick="return doclink('link-81', '_gateway', 'link-81');">_gateway</a></tt><tt class="py-op">.</tt><tt class="py-name">_gateway_client</tt><tt class="py-op">)</tt> </tt>
+<a name="L733"></a><tt class="py-lineno">733</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_pickled_broadcast_vars</tt><tt class="py-op">.</tt><tt class="py-name">clear</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L734"></a><tt class="py-lineno">734</tt> <tt class="py-line"> <tt class="py-name">class_manifest</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt><tt class="py-op">.</tt><tt class="py-name">classManifest</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L735"></a><tt class="py-lineno">735</tt> <tt class="py-line"> <tt class="py-name">env</tt> <tt class="py-op">=</tt> <tt class="py-name">copy</tt><tt class="py-op">.</tt><tt class="py-name">copy</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">environment</tt><tt class="py-op">)</tt> </tt>
+<a name="L736"></a><tt class="py-lineno">736</tt> <tt class="py-line"> <tt class="py-name">env</tt><tt class="py-op">[</tt><tt class="py-string">'PYTHONPATH'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">os</tt><tt class="py-op">.</tt><tt class="py-name">environ</tt><tt class="py-op">.</tt><tt id="link-82" class="py-name"><a title="pyspark.files.SparkFiles.get" class="py-name" href="#" onclick="return doclink('link-82', 'get', 'link-11');">get</a></tt><tt class="py-op">(</tt><tt class="py-string">"PYTHONPATH"</tt><tt class="py-op">,</tt> <tt class="py-string">""</tt><tt class="py-op">)</tt> </tt>
+<a name="L737"></a><tt class="py-lineno">737</tt> <tt class="py-line"> <tt class="py-name">env</tt> <tt class="py-op">=</tt> <tt class="py-name">MapConverter</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">convert</tt><tt class="py-op">(</tt><tt class="py-name">env</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-83" class="py-name"><a title="pyspark.context.SparkContext._gateway" class="py-name" href="#" onclick="return doclink('link-83', '_gateway', 'link-81');">_gateway</a></tt><tt class="py-op">.</tt><tt class="py-name">_gateway_client</tt><tt class="py-op">)</tt> </tt>
+<a name="L738"></a><tt class="py-lineno">738</tt> <tt class="py-line"> <tt class="py-name">python_rdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-84" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-84', '_jvm', 'link-55');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">PythonRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt><tt class="py-op">.</tt><tt id="link-85" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-85', 'rdd', 'link-5');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> </tt>
+<a name="L739"></a><tt class="py-lineno">739</tt> <tt class="py-line"> <tt class="py-name">pipe_command</tt><tt class="py-op">,</tt> <tt class="py-name">env</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">preservesPartitioning</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">pythonExec</tt><tt class="py-op">,</tt> </tt>
+<a name="L740"></a><tt class="py-lineno">740</tt> <tt class="py-line"> <tt class="py-name">broadcast_vars</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_javaAccumulator</tt><tt class="py-op">,</tt> <tt class="py-name">class_manifest</tt><tt class="py-op">)</tt> </tt>
+<a name="L741"></a><tt class="py-lineno">741</tt> <tt class="py-line"> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt> <tt class="py-op">=</tt> <tt class="py-name">python_rdd</tt><tt class="py-op">.</tt><tt class="py-name">asJavaRDD</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L742"></a><tt class="py-lineno">742</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt> </tt>
+</div><a name="L743"></a><tt class="py-lineno">743</tt> <tt class="py-line"> </tt>
+<a name="PipelinedRDD._is_pipelinable"></a><div id="PipelinedRDD._is_pipelinable-def"><a name="L744"></a><tt class="py-lineno">744</tt> <a class="py-toggle" href="#" id="PipelinedRDD._is_pipelinable-toggle" onclick="return toggle('PipelinedRDD._is_pipelinable');">-</a><tt class="py-line"> <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.PipelinedRDD-class.html#_is_pipelinable">_is_pipelinable</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="PipelinedRDD._is_pipelinable-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="PipelinedRDD._is_pipelinable-expanded"><a name="L745"></a><tt class="py-lineno">745</tt> <tt class="py-line"> <tt class="py-keyword">return</tt> <tt class="py-keyword">not</tt> <tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-keyword">or</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt><tt class="py-op">)</tt> </tt>
+</div></div><a name="L746"></a><tt class="py-lineno">746</tt> <tt class="py-line"> </tt>
+<a name="_test"></a><div id="_test-def"><a name="L747"></a><tt class="py-lineno">747</tt> <tt class="py-line"> </tt>
+<a name="L748"></a><tt class="py-lineno">748</tt> <a class="py-toggle" href="#" id="_test-toggle" onclick="return toggle('_test');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd-module.html#_test">_test</a><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
+</div><div id="_test-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="_test-expanded"><a name="L749"></a><tt class="py-lineno">749</tt> <tt class="py-line"> <tt class="py-keyword">import</tt> <tt class="py-name">doctest</tt> </tt>
+<a name="L750"></a><tt class="py-lineno">750</tt> <tt class="py-line"> <tt class="py-keyword">from</tt> <tt id="link-86" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-86', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-87" class="py-name" targets="Module pyspark.context=pyspark.context-module.html,Method pyspark.rdd.RDD.context()=pyspark.rdd.RDD-class.html#context"><a title="pyspark.context
+pyspark.rdd.RDD.context" class="py-name" href="#" onclick="return doclink('link-87', 'context', 'link-87');">context</a></tt> <tt class="py-keyword">import</tt> <tt id="link-88" class="py-name" targets="Class pyspark.context.SparkContext=pyspark.context.SparkContext-class.html"><a title="pyspark.context.SparkContext" class="py-name" href="#" onclick="return doclink('link-88', 'SparkContext', 'link-88');">SparkContext</a></tt> </tt>
+<a name="L751"></a><tt class="py-lineno">751</tt> <tt class="py-line"> <tt class="py-name">globs</tt> <tt class="py-op">=</tt> <tt class="py-name">globals</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">copy</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L752"></a><tt class="py-lineno">752</tt> <tt class="py-line"> <tt class="py-comment"># The small batch size here ensures that we see multiple batches,</tt> </tt>
+<a name="L753"></a><tt class="py-lineno">753</tt> <tt class="py-line"> <tt class="py-comment"># even in these small test examples:</tt> </tt>
+<a name="L754"></a><tt class="py-lineno">754</tt> <tt class="py-line"> <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sc'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt id="link-89" class="py-name"><a title="pyspark.context.SparkContext" class="py-name" href="#" onclick="return doclink('link-89', 'SparkContext', 'link-88');">SparkContext</a></tt><tt class="py-op">(</tt><tt class="py-string">'local[4]'</tt><tt class="py-op">,</tt> <tt class="py-string">'PythonTest'</tt><tt class="py-op">,</tt> <tt class="py-name">batchSize</tt><tt class="py-op">=</tt><tt class="py-number">2</tt><tt class="py-op">)</tt> </tt>
+<a name="L755"></a><tt class="py-lineno">755</tt> <tt class="py-line"> <tt class="py-op">(</tt><tt class="py-name">failure_count</tt><tt class="py-op">,</tt> <tt class="py-name">test_count</tt><tt class="py-op">)</tt> <tt class="py-op">=</tt> <tt class="py-name">doctest</tt><tt class="py-op">.</tt><tt class="py-name">testmod</tt><tt class="py-op">(</tt><tt class="py-name">globs</tt><tt class="py-op">=</tt><tt class="py-name">globs</tt><tt class="py-op">)</tt> </tt>
+<a name="L756"></a><tt class="py-lineno">756</tt> <tt class="py-line"> <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sc'</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt id="link-90" class="py-name" targets="Method pyspark.context.SparkContext.stop()=pyspark.context.SparkContext-class.html#stop"><a title="pyspark.context.SparkContext.stop" class="py-name" href="#" onclick="return doclink('link-90', 'stop', 'link-90');">stop</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L757"></a><tt class="py-lineno">757</tt> <tt class="py-line"> <tt class="py-keyword">if</tt> <tt class="py-name">failure_count</tt><tt class="py-op">:</tt> </tt>
+<a name="L758"></a><tt class="py-lineno">758</tt> <tt class="py-line"> <tt class="py-name">exit</tt><tt class="py-op">(</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">)</tt> </tt>
+</div><a name="L759"></a><tt class="py-lineno">759</tt> <tt class="py-line"> </tt>
+<a name="L760"></a><tt class="py-lineno">760</tt> <tt class="py-line"> </tt>
+<a name="L761"></a><tt class="py-lineno">761</tt> <tt class="py-line"><tt class="py-keyword">if</tt> <tt class="py-name">__name__</tt> <tt class="py-op">==</tt> <tt class="py-string">"__main__"</tt><tt class="py-op">:</tt> </tt>
+<a name="L762"></a><tt class="py-lineno">762</tt> <tt class="py-line"> <tt class="py-name">_test</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
+<a name="L763"></a><tt class="py-lineno">763</tt> <tt class="py-line"> </tt><script type="text/javascript">
+<!--
+expandto(location.href);
+// -->
+</script>
+</pre>
+<br />
+<!-- ==================== NAVIGATION BAR ==================== -->
+<table class="navbar" border="0" width="100%" cellpadding="0"
+ bgcolor="#a0c0ff" cellspacing="0">
+ <tr valign="middle">
+ <!-- Home link -->
+ <th>&nbsp;&nbsp;&nbsp;<a
+ href="pyspark-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>
+
+ <!-- Tree link -->
+ <th>&nbsp;&nbsp;&nbsp;<a
+ href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>
+
+ <!-- Index link -->
+ <th>&nbsp;&nbsp;&nbsp;<a
+ href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>
+
+ <!-- Help link -->
+ <th>&nbsp;&nbsp;&nbsp;<a
+ href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>
+
+ <!-- Project homepage -->
+ <th class="navbar" align="right" width="100%">
+ <table border="0" cellpadding="0" cellspacing="0">
+ <tr><th class="navbar" align="center"
+ ><a class="navbar" target="_top" href="http://spark-project.org">PySpark</a></th>
+ </tr></table></th>
+ </tr>
+</table>
+<table border="0" cellpadding="0" cellspacing="0" width="100%">
+ <tr>
+ <td align="left" class="footer">
+ Generated by Epydoc 3.0.1 on Tue Feb 26 22:47:39 2013
+ </td>
+ <td align="right" class="footer">
+ <a target="mainFrame" href="http://epydoc.sourceforge.net"
+ >http://epydoc.sourceforge.net</a>
+ </td>
+ </tr>
+</table>
+
+<script type="text/javascript">
+ <!--
+ // Private objects are initially displayed (because if
+ // javascript is turned off then we want them to be
+ // visible); but by default, we want to hide them. So hide
+ // them unless we have a cookie that says to show them.
+ checkCookie();
+ // -->
+</script>
+</body>
+</html>