summary refs log tree commit diff
path: root/site/docs/0.9.0/api/pyspark/pyspark.rdd-pysrc.html
blob: 33da32759535b8a2ac4292c44d96b152ef85a38f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
<?xml version="1.0" encoding="ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- The system identifier above is the absolute W3C URI; a relative "DTD/..." path would be resolved against this page's own location. -->
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
  <!-- Encoding declaration for HTML parsers, which do not read the encoding given in the XML declaration. -->
  <meta http-equiv="Content-Type" content="text/html; charset=us-ascii" />
  <title>pyspark.rdd</title>
  <!-- Shared epydoc stylesheet. -->
  <link rel="stylesheet" href="epydoc.css" type="text/css" />
  <!-- NOTE(review): presumably provides the toggle() and doclink() handlers called from onclick attributes in the source listing; confirm against epydoc.js. -->
  <script type="text/javascript" src="epydoc.js"></script>
</head>

<body bgcolor="white" text="black" link="blue" vlink="#204080"
      alink="#204080">
<!-- ==================== NAVIGATION BAR ==================== -->
<!-- One header row: Home, Trees, Indices, Help, then the right-aligned project link. -->
<table class="navbar" width="100%" bgcolor="#a0c0ff" border="0" cellpadding="0" cellspacing="0">
  <tr valign="middle">
    <!-- Package home page -->
    <th>&nbsp;&nbsp;&nbsp;<a href="pyspark-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

    <!-- Module tree -->
    <th>&nbsp;&nbsp;&nbsp;<a href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

    <!-- Identifier index -->
    <th>&nbsp;&nbsp;&nbsp;<a href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

    <!-- Help page -->
    <th>&nbsp;&nbsp;&nbsp;<a href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

    <!-- Project homepage, held in a nested one-cell table -->
    <th class="navbar" align="right" width="100%">
      <table border="0" cellpadding="0" cellspacing="0">
        <tr>
          <th class="navbar" align="center"><a class="navbar" target="_top" href="http://spark-project.org">PySpark</a></th>
        </tr>
      </table></th>
  </tr>
</table>
<!-- Breadcrumb trail in the first cell, frames / no-frames switch in the second. -->
<table width="100%" cellpadding="0" cellspacing="0">
  <tr valign="top">
    <td width="100%">
      <span class="breadcrumbs">
        <a href="pyspark-module.html">Package&nbsp;pyspark</a> :: Module&nbsp;rdd
      </span>
    </td>
    <td>
      <table cellpadding="0" cellspacing="0">
        <!-- Frames toggle; both links carry target="_top". -->
        <tr>
          <td align="right"><span class="options">[<a href="frames.html" target="_top">frames</a>]&nbsp;|&nbsp;<a href="pyspark.rdd-pysrc.html" target="_top">no&nbsp;frames</a>]</span></td>
        </tr>
      </table>
    </td>
  </tr>
</table>
<h1 class="epydoc">Source Code for <a href="pyspark.rdd-module.html">Module pyspark.rdd</a></h1>
<pre class="py-src">
<a name="L1"></a><tt class="py-lineno">   1</tt>  <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L2"></a><tt class="py-lineno">   2</tt>  <tt class="py-line"><tt class="py-comment"># Licensed to the Apache Software Foundation (ASF) under one or more</tt> </tt>
<a name="L3"></a><tt class="py-lineno">   3</tt>  <tt class="py-line"><tt class="py-comment"># contributor license agreements.  See the NOTICE file distributed with</tt> </tt>
<a name="L4"></a><tt class="py-lineno">   4</tt>  <tt class="py-line"><tt class="py-comment"># this work for additional information regarding copyright ownership.</tt> </tt>
<a name="L5"></a><tt class="py-lineno">   5</tt>  <tt class="py-line"><tt class="py-comment"># The ASF licenses this file to You under the Apache License, Version 2.0</tt> </tt>
<a name="L6"></a><tt class="py-lineno">   6</tt>  <tt class="py-line"><tt class="py-comment"># (the "License"); you may not use this file except in compliance with</tt> </tt>
<a name="L7"></a><tt class="py-lineno">   7</tt>  <tt class="py-line"><tt class="py-comment"># the License.  You may obtain a copy of the License at</tt> </tt>
<a name="L8"></a><tt class="py-lineno">   8</tt>  <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L9"></a><tt class="py-lineno">   9</tt>  <tt class="py-line"><tt class="py-comment">#    http://www.apache.org/licenses/LICENSE-2.0</tt> </tt>
<a name="L10"></a><tt class="py-lineno">  10</tt>  <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L11"></a><tt class="py-lineno">  11</tt>  <tt class="py-line"><tt class="py-comment"># Unless required by applicable law or agreed to in writing, software</tt> </tt>
<a name="L12"></a><tt class="py-lineno">  12</tt>  <tt class="py-line"><tt class="py-comment"># distributed under the License is distributed on an "AS IS" BASIS,</tt> </tt>
<a name="L13"></a><tt class="py-lineno">  13</tt>  <tt class="py-line"><tt class="py-comment"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</tt> </tt>
<a name="L14"></a><tt class="py-lineno">  14</tt>  <tt class="py-line"><tt class="py-comment"># See the License for the specific language governing permissions and</tt> </tt>
<a name="L15"></a><tt class="py-lineno">  15</tt>  <tt class="py-line"><tt class="py-comment"># limitations under the License.</tt> </tt>
<a name="L16"></a><tt class="py-lineno">  16</tt>  <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L17"></a><tt class="py-lineno">  17</tt>  <tt class="py-line"> </tt>
<a name="L18"></a><tt class="py-lineno">  18</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">base64</tt> <tt class="py-keyword">import</tt> <tt class="py-name">standard_b64encode</tt> <tt class="py-keyword">as</tt> <tt class="py-name">b64enc</tt> </tt>
<a name="L19"></a><tt class="py-lineno">  19</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt id="link-0" class="py-name" targets="Method pyspark.statcounter.StatCounter.copy()=pyspark.statcounter.StatCounter-class.html#copy"><a title="pyspark.statcounter.StatCounter.copy" class="py-name" href="#" onclick="return doclink('link-0', 'copy', 'link-0');">copy</a></tt> </tt>
<a name="L20"></a><tt class="py-lineno">  20</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">collections</tt> <tt class="py-keyword">import</tt> <tt class="py-name">defaultdict</tt> </tt>
<a name="L21"></a><tt class="py-lineno">  21</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">itertools</tt> <tt class="py-keyword">import</tt> <tt class="py-name">chain</tt><tt class="py-op">,</tt> <tt class="py-name">ifilter</tt><tt class="py-op">,</tt> <tt class="py-name">imap</tt> </tt>
<a name="L22"></a><tt class="py-lineno">  22</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">operator</tt> </tt>
<a name="L23"></a><tt class="py-lineno">  23</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">os</tt> </tt>
<a name="L24"></a><tt class="py-lineno">  24</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">sys</tt> </tt>
<a name="L25"></a><tt class="py-lineno">  25</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">shlex</tt> </tt>
<a name="L26"></a><tt class="py-lineno">  26</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">traceback</tt> </tt>
<a name="L27"></a><tt class="py-lineno">  27</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">subprocess</tt> <tt class="py-keyword">import</tt> <tt class="py-name">Popen</tt><tt class="py-op">,</tt> <tt class="py-name">PIPE</tt> </tt>
<a name="L28"></a><tt class="py-lineno">  28</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">tempfile</tt> <tt class="py-keyword">import</tt> <tt class="py-name">NamedTemporaryFile</tt> </tt>
<a name="L29"></a><tt class="py-lineno">  29</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">threading</tt> <tt class="py-keyword">import</tt> <tt class="py-name">Thread</tt> </tt>
<a name="L30"></a><tt class="py-lineno">  30</tt>  <tt class="py-line"><tt class="py-keyword">import</tt> <tt class="py-name">warnings</tt> </tt>
<a name="L31"></a><tt class="py-lineno">  31</tt>  <tt class="py-line"> </tt>
<a name="L32"></a><tt class="py-lineno">  32</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-1" class="py-name" targets="Package pyspark=pyspark-module.html"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-1', 'pyspark', 'link-1');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-2" class="py-name" targets="Module pyspark.serializers=pyspark.serializers-module.html"><a title="pyspark.serializers" class="py-name" href="#" onclick="return doclink('link-2', 'serializers', 'link-2');">serializers</a></tt> <tt class="py-keyword">import</tt> <tt class="py-name">NoOpSerializer</tt><tt class="py-op">,</tt> <tt class="py-name">CartesianDeserializer</tt><tt class="py-op">,</tt> \ </tt>
<a name="L33"></a><tt class="py-lineno">  33</tt>  <tt class="py-line">    <tt class="py-name">BatchedSerializer</tt><tt class="py-op">,</tt> <tt class="py-name">CloudPickleSerializer</tt><tt class="py-op">,</tt> <tt class="py-name">pack_long</tt> </tt>
<a name="L34"></a><tt class="py-lineno">  34</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-3" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-3', 'pyspark', 'link-1');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-4" class="py-name" targets="Method pyspark.rdd.RDD.join()=pyspark.rdd.RDD-class.html#join"><a title="pyspark.rdd.RDD.join" class="py-name" href="#" onclick="return doclink('link-4', 'join', 'link-4');">join</a></tt> <tt class="py-keyword">import</tt> <tt class="py-name">python_join</tt><tt class="py-op">,</tt> <tt class="py-name">python_left_outer_join</tt><tt class="py-op">,</tt> \ </tt>
<a name="L35"></a><tt class="py-lineno">  35</tt>  <tt class="py-line">    <tt class="py-name">python_right_outer_join</tt><tt class="py-op">,</tt> <tt class="py-name">python_cogroup</tt> </tt>
<a name="L36"></a><tt class="py-lineno">  36</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-5" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-5', 'pyspark', 'link-1');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-6" class="py-name" targets="Module pyspark.statcounter=pyspark.statcounter-module.html"><a title="pyspark.statcounter" class="py-name" href="#" onclick="return doclink('link-6', 'statcounter', 'link-6');">statcounter</a></tt> <tt class="py-keyword">import</tt> <tt id="link-7" class="py-name" targets="Class pyspark.statcounter.StatCounter=pyspark.statcounter.StatCounter-class.html"><a title="pyspark.statcounter.StatCounter" class="py-name" href="#" onclick="return doclink('link-7', 'StatCounter', 'link-7');">StatCounter</a></tt> </tt>
<a name="L37"></a><tt class="py-lineno">  37</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-8" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-8', 'pyspark', 'link-1');">pyspark</a></tt><tt class="py-op">.</tt><tt class="py-name">rddsampler</tt> <tt class="py-keyword">import</tt> <tt class="py-name">RDDSampler</tt> </tt>
<a name="L38"></a><tt class="py-lineno">  38</tt>  <tt class="py-line"> </tt>
<a name="L39"></a><tt class="py-lineno">  39</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">py4j</tt><tt class="py-op">.</tt><tt class="py-name">java_collections</tt> <tt class="py-keyword">import</tt> <tt class="py-name">ListConverter</tt><tt class="py-op">,</tt> <tt class="py-name">MapConverter</tt> </tt>
<a name="L40"></a><tt class="py-lineno">  40</tt>  <tt class="py-line"> </tt>
<a name="L41"></a><tt class="py-lineno">  41</tt>  <tt class="py-line"> </tt>
<a name="L42"></a><tt class="py-lineno">  42</tt>  <tt class="py-line"><tt class="py-name">__all__</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-string">"RDD"</tt><tt class="py-op">]</tt> </tt>
<a name="_extract_concise_traceback"></a><div id="_extract_concise_traceback-def"><a name="L43"></a><tt class="py-lineno">  43</tt>  <tt class="py-line"> </tt>
<a name="L44"></a><tt class="py-lineno">  44</tt> <a class="py-toggle" href="#" id="_extract_concise_traceback-toggle" onclick="return toggle('_extract_concise_traceback');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd-module.html#_extract_concise_traceback">_extract_concise_traceback</a><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="_extract_concise_traceback-collapsed" style="display:none;" pad="++++" indent="++++"></div><div id="_extract_concise_traceback-expanded"><a name="L45"></a><tt class="py-lineno">  45</tt>  <tt class="py-line">    <tt class="py-name">tb</tt> <tt class="py-op">=</tt> <tt class="py-name">traceback</tt><tt class="py-op">.</tt><tt class="py-name">extract_stack</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L46"></a><tt class="py-lineno">  46</tt>  <tt class="py-line">    <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">tb</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-number">0</tt><tt class="py-op">:</tt> </tt>
<a name="L47"></a><tt class="py-lineno">  47</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-string">"I'm lost!"</tt> </tt>
<a name="L48"></a><tt class="py-lineno">  48</tt>  <tt class="py-line">    <tt class="py-comment"># HACK:  This function is in a file called 'rdd.py' in the top level of</tt> </tt>
<a name="L49"></a><tt class="py-lineno">  49</tt>  <tt class="py-line">    <tt class="py-comment"># everything PySpark.  Just trim off the directory name and assume</tt> </tt>
<a name="L50"></a><tt class="py-lineno">  50</tt>  <tt class="py-line">    <tt class="py-comment"># everything in that tree is PySpark guts.</tt> </tt>
<a name="L51"></a><tt class="py-lineno">  51</tt>  <tt class="py-line">    <tt class="py-name">file</tt><tt class="py-op">,</tt> <tt class="py-name">line</tt><tt class="py-op">,</tt> <tt class="py-name">module</tt><tt class="py-op">,</tt> <tt class="py-name">what</tt> <tt class="py-op">=</tt> <tt class="py-name">tb</tt><tt class="py-op">[</tt><tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">tb</tt><tt class="py-op">)</tt> <tt class="py-op">-</tt> <tt class="py-number">1</tt><tt class="py-op">]</tt> </tt>
<a name="L52"></a><tt class="py-lineno">  52</tt>  <tt class="py-line">    <tt class="py-name">sparkpath</tt> <tt class="py-op">=</tt> <tt class="py-name">os</tt><tt class="py-op">.</tt><tt class="py-name">path</tt><tt class="py-op">.</tt><tt class="py-name">dirname</tt><tt class="py-op">(</tt><tt class="py-name">file</tt><tt class="py-op">)</tt> </tt>
<a name="L53"></a><tt class="py-lineno">  53</tt>  <tt class="py-line">    <tt class="py-name">first_spark_frame</tt> <tt class="py-op">=</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">tb</tt><tt class="py-op">)</tt> <tt class="py-op">-</tt> <tt class="py-number">1</tt> </tt>
<a name="L54"></a><tt class="py-lineno">  54</tt>  <tt class="py-line">    <tt class="py-keyword">for</tt> <tt class="py-name">i</tt> <tt class="py-keyword">in</tt> <tt class="py-name">range</tt><tt class="py-op">(</tt><tt class="py-number">0</tt><tt class="py-op">,</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">tb</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L55"></a><tt class="py-lineno">  55</tt>  <tt class="py-line">        <tt class="py-name">file</tt><tt class="py-op">,</tt> <tt class="py-name">line</tt><tt class="py-op">,</tt> <tt class="py-name">fun</tt><tt class="py-op">,</tt> <tt class="py-name">what</tt> <tt class="py-op">=</tt> <tt class="py-name">tb</tt><tt class="py-op">[</tt><tt class="py-name">i</tt><tt class="py-op">]</tt> </tt>
<a name="L56"></a><tt class="py-lineno">  56</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">file</tt><tt class="py-op">.</tt><tt class="py-name">startswith</tt><tt class="py-op">(</tt><tt class="py-name">sparkpath</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L57"></a><tt class="py-lineno">  57</tt>  <tt class="py-line">            <tt class="py-name">first_spark_frame</tt> <tt class="py-op">=</tt> <tt class="py-name">i</tt> </tt>
<a name="L58"></a><tt class="py-lineno">  58</tt>  <tt class="py-line">            <tt class="py-keyword">break</tt> </tt>
<a name="L59"></a><tt class="py-lineno">  59</tt>  <tt class="py-line">    <tt class="py-keyword">if</tt> <tt class="py-name">first_spark_frame</tt> <tt class="py-op">==</tt> <tt class="py-number">0</tt><tt class="py-op">:</tt> </tt>
<a name="L60"></a><tt class="py-lineno">  60</tt>  <tt class="py-line">        <tt class="py-name">file</tt><tt class="py-op">,</tt> <tt class="py-name">line</tt><tt class="py-op">,</tt> <tt class="py-name">fun</tt><tt class="py-op">,</tt> <tt class="py-name">what</tt> <tt class="py-op">=</tt> <tt class="py-name">tb</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt> </tt>
<a name="L61"></a><tt class="py-lineno">  61</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-string">"%s at %s:%d"</tt> <tt class="py-op">%</tt> <tt class="py-op">(</tt><tt class="py-name">fun</tt><tt class="py-op">,</tt> <tt class="py-name">file</tt><tt class="py-op">,</tt> <tt class="py-name">line</tt><tt class="py-op">)</tt> </tt>
<a name="L62"></a><tt class="py-lineno">  62</tt>  <tt class="py-line">    <tt class="py-name">sfile</tt><tt class="py-op">,</tt> <tt class="py-name">sline</tt><tt class="py-op">,</tt> <tt class="py-name">sfun</tt><tt class="py-op">,</tt> <tt class="py-name">swhat</tt> <tt class="py-op">=</tt> <tt class="py-name">tb</tt><tt class="py-op">[</tt><tt class="py-name">first_spark_frame</tt><tt class="py-op">]</tt> </tt>
<a name="L63"></a><tt class="py-lineno">  63</tt>  <tt class="py-line">    <tt class="py-name">ufile</tt><tt class="py-op">,</tt> <tt class="py-name">uline</tt><tt class="py-op">,</tt> <tt class="py-name">ufun</tt><tt class="py-op">,</tt> <tt class="py-name">uwhat</tt> <tt class="py-op">=</tt> <tt class="py-name">tb</tt><tt class="py-op">[</tt><tt class="py-name">first_spark_frame</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">]</tt> </tt>
<a name="L64"></a><tt class="py-lineno">  64</tt>  <tt class="py-line">    <tt class="py-keyword">return</tt> <tt class="py-string">"%s at %s:%d"</tt> <tt class="py-op">%</tt> <tt class="py-op">(</tt><tt class="py-name">sfun</tt><tt class="py-op">,</tt> <tt class="py-name">ufile</tt><tt class="py-op">,</tt> <tt class="py-name">uline</tt><tt class="py-op">)</tt> </tt>
</div><a name="L65"></a><tt class="py-lineno">  65</tt>  <tt class="py-line"> </tt>
<a name="L66"></a><tt class="py-lineno">  66</tt>  <tt class="py-line"><tt id="link-9" class="py-name" targets="Variable pyspark.rdd._spark_stack_depth=pyspark.rdd-module.html#_spark_stack_depth"><a title="pyspark.rdd._spark_stack_depth" class="py-name" href="#" onclick="return doclink('link-9', '_spark_stack_depth', 'link-9');">_spark_stack_depth</a></tt> <tt class="py-op">=</tt> <tt class="py-number">0</tt> </tt>
<a name="_JavaStackTrace"></a><div id="_JavaStackTrace-def"><a name="L67"></a><tt class="py-lineno">  67</tt>  <tt class="py-line"> </tt>
<a name="L68"></a><tt class="py-lineno">  68</tt> <a class="py-toggle" href="#" id="_JavaStackTrace-toggle" onclick="return toggle('_JavaStackTrace');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.rdd._JavaStackTrace-class.html">_JavaStackTrace</a><tt class="py-op">(</tt><tt class="py-base-class">object</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="_JavaStackTrace-collapsed" style="display:none;" pad="++++" indent="++++"></div><div id="_JavaStackTrace-expanded"><a name="_JavaStackTrace.__init__"></a><div id="_JavaStackTrace.__init__-def"><a name="L69"></a><tt class="py-lineno">  69</tt> <a class="py-toggle" href="#" id="_JavaStackTrace.__init__-toggle" onclick="return toggle('_JavaStackTrace.__init__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd._JavaStackTrace-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">sc</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="_JavaStackTrace.__init__-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="_JavaStackTrace.__init__-expanded"><a name="L70"></a><tt class="py-lineno">  70</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_traceback</tt> <tt class="py-op">=</tt> <tt class="py-name">_extract_concise_traceback</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L71"></a><tt class="py-lineno">  71</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_context</tt> <tt class="py-op">=</tt> <tt class="py-name">sc</tt> </tt>
</div><a name="L72"></a><tt class="py-lineno">  72</tt>  <tt class="py-line"> </tt>
<a name="_JavaStackTrace.__enter__"></a><div id="_JavaStackTrace.__enter__-def"><a name="L73"></a><tt class="py-lineno">  73</tt> <a class="py-toggle" href="#" id="_JavaStackTrace.__enter__-toggle" onclick="return toggle('_JavaStackTrace.__enter__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd._JavaStackTrace-class.html#__enter__">__enter__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="_JavaStackTrace.__enter__-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="_JavaStackTrace.__enter__-expanded"><a name="L74"></a><tt class="py-lineno">  74</tt>  <tt class="py-line">        <tt class="py-keyword">global</tt> <tt id="link-10" class="py-name"><a title="pyspark.rdd._spark_stack_depth" class="py-name" href="#" onclick="return doclink('link-10', '_spark_stack_depth', 'link-9');">_spark_stack_depth</a></tt> </tt>
<a name="L75"></a><tt class="py-lineno">  75</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt id="link-11" class="py-name"><a title="pyspark.rdd._spark_stack_depth" class="py-name" href="#" onclick="return doclink('link-11', '_spark_stack_depth', 'link-9');">_spark_stack_depth</a></tt> <tt class="py-op">==</tt> <tt class="py-number">0</tt><tt class="py-op">:</tt> </tt>
<a name="L76"></a><tt class="py-lineno">  76</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_context</tt><tt class="py-op">.</tt><tt class="py-name">_jsc</tt><tt class="py-op">.</tt><tt class="py-name">setCallSite</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_traceback</tt><tt class="py-op">)</tt> </tt>
<a name="L77"></a><tt class="py-lineno">  77</tt>  <tt class="py-line">        <tt id="link-12" class="py-name"><a title="pyspark.rdd._spark_stack_depth" class="py-name" href="#" onclick="return doclink('link-12', '_spark_stack_depth', 'link-9');">_spark_stack_depth</a></tt> <tt class="py-op">+=</tt> <tt class="py-number">1</tt> </tt>
</div><a name="L78"></a><tt class="py-lineno">  78</tt>  <tt class="py-line"> </tt>
<a name="_JavaStackTrace.__exit__"></a><div id="_JavaStackTrace.__exit__-def"><a name="L79"></a><tt class="py-lineno">  79</tt> <a class="py-toggle" href="#" id="_JavaStackTrace.__exit__-toggle" onclick="return toggle('_JavaStackTrace.__exit__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd._JavaStackTrace-class.html#__exit__">__exit__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">type</tt><tt class="py-op">,</tt> <tt class="py-param">value</tt><tt class="py-op">,</tt> <tt class="py-param">tb</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="_JavaStackTrace.__exit__-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="_JavaStackTrace.__exit__-expanded"><a name="L80"></a><tt class="py-lineno">  80</tt>  <tt class="py-line">        <tt class="py-keyword">global</tt> <tt id="link-13" class="py-name"><a title="pyspark.rdd._spark_stack_depth" class="py-name" href="#" onclick="return doclink('link-13', '_spark_stack_depth', 'link-9');">_spark_stack_depth</a></tt> </tt>
<a name="L81"></a><tt class="py-lineno">  81</tt>  <tt class="py-line">        <tt id="link-14" class="py-name"><a title="pyspark.rdd._spark_stack_depth" class="py-name" href="#" onclick="return doclink('link-14', '_spark_stack_depth', 'link-9');">_spark_stack_depth</a></tt> <tt class="py-op">-=</tt> <tt class="py-number">1</tt> </tt>
<a name="L82"></a><tt class="py-lineno">  82</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt id="link-15" class="py-name"><a title="pyspark.rdd._spark_stack_depth" class="py-name" href="#" onclick="return doclink('link-15', '_spark_stack_depth', 'link-9');">_spark_stack_depth</a></tt> <tt class="py-op">==</tt> <tt class="py-number">0</tt><tt class="py-op">:</tt> </tt>
<a name="L83"></a><tt class="py-lineno">  83</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_context</tt><tt class="py-op">.</tt><tt class="py-name">_jsc</tt><tt class="py-op">.</tt><tt class="py-name">setCallSite</tt><tt class="py-op">(</tt><tt class="py-name">None</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L84"></a><tt class="py-lineno">  84</tt>  <tt class="py-line"> </tt>
<a name="RDD"></a><div id="RDD-def"><a name="L85"></a><tt class="py-lineno">  85</tt> <a class="py-toggle" href="#" id="RDD-toggle" onclick="return toggle('RDD');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html">RDD</a><tt class="py-op">(</tt><tt class="py-base-class">object</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD-collapsed" style="display:none;" pad="++++" indent="++++"></div><div id="RDD-expanded"><a name="L86"></a><tt class="py-lineno">  86</tt>  <tt class="py-line">    <tt class="py-docstring">"""</tt> </tt>
<a name="L87"></a><tt class="py-lineno">  87</tt>  <tt class="py-line"><tt class="py-docstring">    A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.</tt> </tt>
<a name="L88"></a><tt class="py-lineno">  88</tt>  <tt class="py-line"><tt class="py-docstring">    Represents an immutable, partitioned collection of elements that can be</tt> </tt>
<a name="L89"></a><tt class="py-lineno">  89</tt>  <tt class="py-line"><tt class="py-docstring">    operated on in parallel.</tt> </tt>
<a name="L90"></a><tt class="py-lineno">  90</tt>  <tt class="py-line"><tt class="py-docstring">    """</tt> </tt>
<a name="L91"></a><tt class="py-lineno">  91</tt>  <tt class="py-line"> </tt>
<a name="RDD.__init__"></a><div id="RDD.__init__-def"><a name="L92"></a><tt class="py-lineno">  92</tt> <a class="py-toggle" href="#" id="RDD.__init__-toggle" onclick="return toggle('RDD.__init__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">jrdd</tt><tt class="py-op">,</tt> <tt class="py-param">ctx</tt><tt class="py-op">,</tt> <tt class="py-param">jrdd_deserializer</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.__init__-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.__init__-expanded"><a name="L93"></a><tt class="py-lineno">  93</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">jrdd</tt> </tt>
<a name="L94"></a><tt class="py-lineno">  94</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
<a name="L95"></a><tt class="py-lineno">  95</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
<a name="L96"></a><tt class="py-lineno">  96</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> <tt class="py-op">=</tt> <tt class="py-name">ctx</tt> </tt>
<a name="L97"></a><tt class="py-lineno">  97</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt> <tt class="py-op">=</tt> <tt class="py-name">jrdd_deserializer</tt> </tt>
</div><a name="L98"></a><tt class="py-lineno">  98</tt>  <tt class="py-line"> </tt>
<a name="RDD.__repr__"></a><div id="RDD.__repr__-def"><a name="L99"></a><tt class="py-lineno">  99</tt> <a class="py-toggle" href="#" id="RDD.__repr__-toggle" onclick="return toggle('RDD.__repr__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#__repr__">__repr__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.__repr__-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.__repr__-expanded"><a name="L100"></a><tt class="py-lineno"> 100</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt class="py-name">toString</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L101"></a><tt class="py-lineno"> 101</tt>  <tt class="py-line"> </tt>
<a name="L102"></a><tt class="py-lineno"> 102</tt>  <tt class="py-line">    <tt class="py-decorator">@</tt><tt class="py-decorator">property</tt> </tt>
<a name="RDD.context"></a><div id="RDD.context-def"><a name="L103"></a><tt class="py-lineno"> 103</tt> <a class="py-toggle" href="#" id="RDD.context-toggle" onclick="return toggle('RDD.context');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#context">context</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.context-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.context-expanded"><a name="L104"></a><tt class="py-lineno"> 104</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L105"></a><tt class="py-lineno"> 105</tt>  <tt class="py-line"><tt class="py-docstring">        The L{SparkContext} that this RDD was created on.</tt> </tt>
<a name="L106"></a><tt class="py-lineno"> 106</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L107"></a><tt class="py-lineno"> 107</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> </tt>
</div><a name="L108"></a><tt class="py-lineno"> 108</tt>  <tt class="py-line"> </tt>
<a name="RDD.cache"></a><div id="RDD.cache-def"><a name="L109"></a><tt class="py-lineno"> 109</tt> <a class="py-toggle" href="#" id="RDD.cache-toggle" onclick="return toggle('RDD.cache');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#cache">cache</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.cache-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.cache-expanded"><a name="L110"></a><tt class="py-lineno"> 110</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L111"></a><tt class="py-lineno"> 111</tt>  <tt class="py-line"><tt class="py-docstring">        Persist this RDD with the default storage level (C{MEMORY_ONLY}).</tt> </tt>
<a name="L112"></a><tt class="py-lineno"> 112</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L113"></a><tt class="py-lineno"> 113</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L114"></a><tt class="py-lineno"> 114</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-16" class="py-name" targets="Method pyspark.rdd.RDD.cache()=pyspark.rdd.RDD-class.html#cache"><a title="pyspark.rdd.RDD.cache" class="py-name" href="#" onclick="return doclink('link-16', 'cache', 'link-16');">cache</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L115"></a><tt class="py-lineno"> 115</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt> </tt>
</div><a name="L116"></a><tt class="py-lineno"> 116</tt>  <tt class="py-line"> </tt>
<a name="RDD.persist"></a><div id="RDD.persist-def"><a name="L117"></a><tt class="py-lineno"> 117</tt> <a class="py-toggle" href="#" id="RDD.persist-toggle" onclick="return toggle('RDD.persist');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#persist">persist</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">storageLevel</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.persist-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.persist-expanded"><a name="L118"></a><tt class="py-lineno"> 118</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L119"></a><tt class="py-lineno"> 119</tt>  <tt class="py-line"><tt class="py-docstring">        Set this RDD's storage level to persist its values across operations after the first time</tt> </tt>
<a name="L120"></a><tt class="py-lineno"> 120</tt>  <tt class="py-line"><tt class="py-docstring">        it is computed. This can only be used to assign a new storage level if the RDD does not</tt> </tt>
<a name="L121"></a><tt class="py-lineno"> 121</tt>  <tt class="py-line"><tt class="py-docstring">        have a storage level set yet.</tt> </tt>
<a name="L122"></a><tt class="py-lineno"> 122</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L123"></a><tt class="py-lineno"> 123</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L124"></a><tt class="py-lineno"> 124</tt>  <tt class="py-line">        <tt class="py-name">javaStorageLevel</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_getJavaStorageLevel</tt><tt class="py-op">(</tt><tt class="py-name">storageLevel</tt><tt class="py-op">)</tt> </tt>
<a name="L125"></a><tt class="py-lineno"> 125</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-17" class="py-name" targets="Method pyspark.rdd.RDD.persist()=pyspark.rdd.RDD-class.html#persist"><a title="pyspark.rdd.RDD.persist" class="py-name" href="#" onclick="return doclink('link-17', 'persist', 'link-17');">persist</a></tt><tt class="py-op">(</tt><tt class="py-name">javaStorageLevel</tt><tt class="py-op">)</tt> </tt>
<a name="L126"></a><tt class="py-lineno"> 126</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt> </tt>
</div><a name="L127"></a><tt class="py-lineno"> 127</tt>  <tt class="py-line"> </tt>
<a name="RDD.unpersist"></a><div id="RDD.unpersist-def"><a name="L128"></a><tt class="py-lineno"> 128</tt> <a class="py-toggle" href="#" id="RDD.unpersist-toggle" onclick="return toggle('RDD.unpersist');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#unpersist">unpersist</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.unpersist-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.unpersist-expanded"><a name="L129"></a><tt class="py-lineno"> 129</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L130"></a><tt class="py-lineno"> 130</tt>  <tt class="py-line"><tt class="py-docstring">        Mark the RDD as non-persistent, and remove all blocks for it from memory and disk.</tt> </tt>
<a name="L131"></a><tt class="py-lineno"> 131</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L132"></a><tt class="py-lineno"> 132</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
<a name="L133"></a><tt class="py-lineno"> 133</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-18" class="py-name" targets="Method pyspark.rdd.RDD.unpersist()=pyspark.rdd.RDD-class.html#unpersist"><a title="pyspark.rdd.RDD.unpersist" class="py-name" href="#" onclick="return doclink('link-18', 'unpersist', 'link-18');">unpersist</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L134"></a><tt class="py-lineno"> 134</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt> </tt>
</div><a name="L135"></a><tt class="py-lineno"> 135</tt>  <tt class="py-line"> </tt>
<a name="RDD.checkpoint"></a><div id="RDD.checkpoint-def"><a name="L136"></a><tt class="py-lineno"> 136</tt> <a class="py-toggle" href="#" id="RDD.checkpoint-toggle" onclick="return toggle('RDD.checkpoint');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#checkpoint">checkpoint</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.checkpoint-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.checkpoint-expanded"><a name="L137"></a><tt class="py-lineno"> 137</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L138"></a><tt class="py-lineno"> 138</tt>  <tt class="py-line"><tt class="py-docstring">        Mark this RDD for checkpointing. It will be saved to a file inside the</tt> </tt>
<a name="L139"></a><tt class="py-lineno"> 139</tt>  <tt class="py-line"><tt class="py-docstring">        checkpoint directory set with L{SparkContext.setCheckpointDir()} and</tt> </tt>
<a name="L140"></a><tt class="py-lineno"> 140</tt>  <tt class="py-line"><tt class="py-docstring">        all references to its parent RDDs will be removed. This function must</tt> </tt>
<a name="L141"></a><tt class="py-lineno"> 141</tt>  <tt class="py-line"><tt class="py-docstring">        be called before any job has been executed on this RDD. It is strongly</tt> </tt>
<a name="L142"></a><tt class="py-lineno"> 142</tt>  <tt class="py-line"><tt class="py-docstring">        recommended that this RDD is persisted in memory, otherwise saving it</tt> </tt>
<a name="L143"></a><tt class="py-lineno"> 143</tt>  <tt class="py-line"><tt class="py-docstring">        to a file will require recomputation.</tt> </tt>
<a name="L144"></a><tt class="py-lineno"> 144</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L145"></a><tt class="py-lineno"> 145</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L146"></a><tt class="py-lineno"> 146</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-19" class="py-name" targets="Module pyspark.rdd=pyspark.rdd-module.html"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-19', 'rdd', 'link-19');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-20" class="py-name" targets="Method pyspark.rdd.RDD.checkpoint()=pyspark.rdd.RDD-class.html#checkpoint"><a title="pyspark.rdd.RDD.checkpoint" class="py-name" href="#" onclick="return doclink('link-20', 'checkpoint', 'link-20');">checkpoint</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L147"></a><tt class="py-lineno"> 147</tt>  <tt class="py-line"> </tt>
<a name="RDD.isCheckpointed"></a><div id="RDD.isCheckpointed-def"><a name="L148"></a><tt class="py-lineno"> 148</tt> <a class="py-toggle" href="#" id="RDD.isCheckpointed-toggle" onclick="return toggle('RDD.isCheckpointed');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#isCheckpointed">isCheckpointed</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.isCheckpointed-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.isCheckpointed-expanded"><a name="L149"></a><tt class="py-lineno"> 149</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L150"></a><tt class="py-lineno"> 150</tt>  <tt class="py-line"><tt class="py-docstring">        Return whether this RDD has been checkpointed or not.</tt> </tt>
<a name="L151"></a><tt class="py-lineno"> 151</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L152"></a><tt class="py-lineno"> 152</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-21" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-21', 'rdd', 'link-19');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-22" class="py-name" targets="Method pyspark.rdd.RDD.isCheckpointed()=pyspark.rdd.RDD-class.html#isCheckpointed"><a title="pyspark.rdd.RDD.isCheckpointed" class="py-name" href="#" onclick="return doclink('link-22', 'isCheckpointed', 'link-22');">isCheckpointed</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L153"></a><tt class="py-lineno"> 153</tt>  <tt class="py-line"> </tt>
<a name="RDD.getCheckpointFile"></a><div id="RDD.getCheckpointFile-def"><a name="L154"></a><tt class="py-lineno"> 154</tt> <a class="py-toggle" href="#" id="RDD.getCheckpointFile-toggle" onclick="return toggle('RDD.getCheckpointFile');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#getCheckpointFile">getCheckpointFile</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.getCheckpointFile-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.getCheckpointFile-expanded"><a name="L155"></a><tt class="py-lineno"> 155</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L156"></a><tt class="py-lineno"> 156</tt>  <tt class="py-line"><tt class="py-docstring">        Get the name of the file to which this RDD was checkpointed.</tt> </tt>
<a name="L157"></a><tt class="py-lineno"> 157</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L158"></a><tt class="py-lineno"> 158</tt>  <tt class="py-line">        <tt class="py-name">checkpointFile</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-23" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-23', 'rdd', 'link-19');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-24" class="py-name" targets="Method pyspark.rdd.RDD.getCheckpointFile()=pyspark.rdd.RDD-class.html#getCheckpointFile"><a title="pyspark.rdd.RDD.getCheckpointFile" class="py-name" href="#" onclick="return doclink('link-24', 'getCheckpointFile', 'link-24');">getCheckpointFile</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L159"></a><tt class="py-lineno"> 159</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">checkpointFile</tt><tt class="py-op">.</tt><tt class="py-name">isDefined</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L160"></a><tt class="py-lineno"> 160</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">checkpointFile</tt><tt class="py-op">.</tt><tt id="link-25" class="py-name" targets="Method pyspark.conf.SparkConf.get()=pyspark.conf.SparkConf-class.html#get,Class Method pyspark.files.SparkFiles.get()=pyspark.files.SparkFiles-class.html#get"><a title="pyspark.conf.SparkConf.get
pyspark.files.SparkFiles.get" class="py-name" href="#" onclick="return doclink('link-25', 'get', 'link-25');">get</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L161"></a><tt class="py-lineno"> 161</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L162"></a><tt class="py-lineno"> 162</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">None</tt> </tt>
</div><a name="L163"></a><tt class="py-lineno"> 163</tt>  <tt class="py-line"> </tt>
<a name="RDD.map"></a><div id="RDD.map-def"><a name="L164"></a><tt class="py-lineno"> 164</tt> <a class="py-toggle" href="#" id="RDD.map-toggle" onclick="return toggle('RDD.map');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#map">map</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.map-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.map-expanded"><a name="L165"></a><tt class="py-lineno"> 165</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L166"></a><tt class="py-lineno"> 166</tt>  <tt class="py-line"><tt class="py-docstring">        Return a new RDD by applying a function to each element of this RDD.</tt> </tt>
<a name="L167"></a><tt class="py-lineno"> 167</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L168"></a><tt class="py-lineno"> 168</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">return</tt> <tt class="py-name">imap</tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
<a name="L169"></a><tt class="py-lineno"> 169</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">)</tt> </tt>
</div><a name="L170"></a><tt class="py-lineno"> 170</tt>  <tt class="py-line"> </tt>
<a name="RDD.flatMap"></a><div id="RDD.flatMap-def"><a name="L171"></a><tt class="py-lineno"> 171</tt> <a class="py-toggle" href="#" id="RDD.flatMap-toggle" onclick="return toggle('RDD.flatMap');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#flatMap">flatMap</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.flatMap-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.flatMap-expanded"><a name="L172"></a><tt class="py-lineno"> 172</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L173"></a><tt class="py-lineno"> 173</tt>  <tt class="py-line"><tt class="py-docstring">        Return a new RDD by first applying a function to all elements of this</tt> </tt>
<a name="L174"></a><tt class="py-lineno"> 174</tt>  <tt class="py-line"><tt class="py-docstring">        RDD, and then flattening the results.</tt> </tt>
<a name="L175"></a><tt class="py-lineno"> 175</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L176"></a><tt class="py-lineno"> 176</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([2, 3, 4])</tt> </tt>
<a name="L177"></a><tt class="py-lineno"> 177</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(rdd.flatMap(lambda x: range(1, x)).collect())</tt> </tt>
<a name="L178"></a><tt class="py-lineno"> 178</tt>  <tt class="py-line"><tt class="py-docstring">        [1, 1, 1, 2, 2, 3]</tt> </tt>
<a name="L179"></a><tt class="py-lineno"> 179</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(rdd.flatMap(lambda x: [(x, x), (x, x)]).collect())</tt> </tt>
<a name="L180"></a><tt class="py-lineno"> 180</tt>  <tt class="py-line"><tt class="py-docstring">        [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]</tt> </tt>
<a name="L181"></a><tt class="py-lineno"> 181</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L182"></a><tt class="py-lineno"> 182</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">s</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">return</tt> <tt class="py-name">chain</tt><tt class="py-op">.</tt><tt class="py-name">from_iterable</tt><tt class="py-op">(</tt><tt class="py-name">imap</tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L183"></a><tt class="py-lineno"> 183</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-26" class="py-name" targets="Method pyspark.rdd.RDD.mapPartitionsWithIndex()=pyspark.rdd.RDD-class.html#mapPartitionsWithIndex"><a title="pyspark.rdd.RDD.mapPartitionsWithIndex" class="py-name" href="#" onclick="return doclink('link-26', 'mapPartitionsWithIndex', 'link-26');">mapPartitionsWithIndex</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">)</tt> </tt>
</div><a name="L184"></a><tt class="py-lineno"> 184</tt>  <tt class="py-line"> </tt>
<a name="RDD.mapPartitions"></a><div id="RDD.mapPartitions-def"><a name="L185"></a><tt class="py-lineno"> 185</tt> <a class="py-toggle" href="#" id="RDD.mapPartitions-toggle" onclick="return toggle('RDD.mapPartitions');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#mapPartitions">mapPartitions</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.mapPartitions-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.mapPartitions-expanded"><a name="L186"></a><tt class="py-lineno"> 186</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L187"></a><tt class="py-lineno"> 187</tt>  <tt class="py-line"><tt class="py-docstring">        Return a new RDD by applying a function to each partition of this RDD.</tt> </tt>
<a name="L188"></a><tt class="py-lineno"> 188</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L189"></a><tt class="py-lineno"> 189</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4], 2)</tt> </tt>
<a name="L190"></a><tt class="py-lineno"> 190</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; def f(iterator): yield sum(iterator)</tt> </tt>
<a name="L191"></a><tt class="py-lineno"> 191</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd.mapPartitions(f).collect()</tt> </tt>
<a name="L192"></a><tt class="py-lineno"> 192</tt>  <tt class="py-line"><tt class="py-docstring">        [3, 7]</tt> </tt>
<a name="L193"></a><tt class="py-lineno"> 193</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L194"></a><tt class="py-lineno"> 194</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">s</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">return</tt> <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
<a name="L195"></a><tt class="py-lineno"> 195</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-27" class="py-name"><a title="pyspark.rdd.RDD.mapPartitionsWithIndex" class="py-name" href="#" onclick="return doclink('link-27', 'mapPartitionsWithIndex', 'link-26');">mapPartitionsWithIndex</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
</div><a name="L196"></a><tt class="py-lineno"> 196</tt>  <tt class="py-line"> </tt>
<a name="RDD.mapPartitionsWithIndex"></a><div id="RDD.mapPartitionsWithIndex-def"><a name="L197"></a><tt class="py-lineno"> 197</tt> <a class="py-toggle" href="#" id="RDD.mapPartitionsWithIndex-toggle" onclick="return toggle('RDD.mapPartitionsWithIndex');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#mapPartitionsWithIndex">mapPartitionsWithIndex</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.mapPartitionsWithIndex-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.mapPartitionsWithIndex-expanded"><a name="L198"></a><tt class="py-lineno"> 198</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L199"></a><tt class="py-lineno"> 199</tt>  <tt class="py-line"><tt class="py-docstring">        Return a new RDD by applying a function to each partition of this RDD,</tt> </tt>
<a name="L200"></a><tt class="py-lineno"> 200</tt>  <tt class="py-line"><tt class="py-docstring">        while tracking the index of the original partition.</tt> </tt>
<a name="L201"></a><tt class="py-lineno"> 201</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L202"></a><tt class="py-lineno"> 202</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4], 4)</tt> </tt>
<a name="L203"></a><tt class="py-lineno"> 203</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; def f(splitIndex, iterator): yield splitIndex</tt> </tt>
<a name="L204"></a><tt class="py-lineno"> 204</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd.mapPartitionsWithIndex(f).sum()</tt> </tt>
<a name="L205"></a><tt class="py-lineno"> 205</tt>  <tt class="py-line"><tt class="py-docstring">        6</tt> </tt>
<a name="L206"></a><tt class="py-lineno"> 206</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L207"></a><tt class="py-lineno"> 207</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">)</tt> </tt>
</div><a name="L208"></a><tt class="py-lineno"> 208</tt>  <tt class="py-line"> </tt>
<a name="RDD.mapPartitionsWithSplit"></a><div id="RDD.mapPartitionsWithSplit-def"><a name="L209"></a><tt class="py-lineno"> 209</tt> <a class="py-toggle" href="#" id="RDD.mapPartitionsWithSplit-toggle" onclick="return toggle('RDD.mapPartitionsWithSplit');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#mapPartitionsWithSplit">mapPartitionsWithSplit</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.mapPartitionsWithSplit-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.mapPartitionsWithSplit-expanded"><a name="L210"></a><tt class="py-lineno"> 210</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L211"></a><tt class="py-lineno"> 211</tt>  <tt class="py-line"><tt class="py-docstring">        Deprecated: use mapPartitionsWithIndex instead.</tt> </tt>
<a name="L212"></a><tt class="py-lineno"> 212</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L213"></a><tt class="py-lineno"> 213</tt>  <tt class="py-line"><tt class="py-docstring">        Return a new RDD by applying a function to each partition of this RDD,</tt> </tt>
<a name="L214"></a><tt class="py-lineno"> 214</tt>  <tt class="py-line"><tt class="py-docstring">        while tracking the index of the original partition.</tt> </tt>
<a name="L215"></a><tt class="py-lineno"> 215</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L216"></a><tt class="py-lineno"> 216</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4], 4)</tt> </tt>
<a name="L217"></a><tt class="py-lineno"> 217</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; def f(splitIndex, iterator): yield splitIndex</tt> </tt>
<a name="L218"></a><tt class="py-lineno"> 218</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd.mapPartitionsWithSplit(f).sum()</tt> </tt>
<a name="L219"></a><tt class="py-lineno"> 219</tt>  <tt class="py-line"><tt class="py-docstring">        6</tt> </tt>
<a name="L220"></a><tt class="py-lineno"> 220</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L221"></a><tt class="py-lineno"> 221</tt>  <tt class="py-line">        <tt class="py-name">warnings</tt><tt class="py-op">.</tt><tt class="py-name">warn</tt><tt class="py-op">(</tt><tt class="py-string">"mapPartitionsWithSplit is deprecated; "</tt> </tt>
<a name="L222"></a><tt class="py-lineno"> 222</tt>  <tt class="py-line">            <tt class="py-string">"use mapPartitionsWithIndex instead"</tt><tt class="py-op">,</tt> <tt class="py-name">DeprecationWarning</tt><tt class="py-op">,</tt> <tt class="py-name">stacklevel</tt><tt class="py-op">=</tt><tt class="py-number">2</tt><tt class="py-op">)</tt> </tt>
<a name="L223"></a><tt class="py-lineno"> 223</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-28" class="py-name"><a title="pyspark.rdd.RDD.mapPartitionsWithIndex" class="py-name" href="#" onclick="return doclink('link-28', 'mapPartitionsWithIndex', 'link-26');">mapPartitionsWithIndex</a></tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">)</tt> </tt>
</div><a name="L224"></a><tt class="py-lineno"> 224</tt>  <tt class="py-line"> </tt>
<a name="RDD.filter"></a><div id="RDD.filter-def"><a name="L225"></a><tt class="py-lineno"> 225</tt> <a class="py-toggle" href="#" id="RDD.filter-toggle" onclick="return toggle('RDD.filter');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#filter">filter</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.filter-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.filter-expanded"><a name="L226"></a><tt class="py-lineno"> 226</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L227"></a><tt class="py-lineno"> 227</tt>  <tt class="py-line"><tt class="py-docstring">        Return a new RDD containing only the elements that satisfy a predicate.</tt> </tt>
<a name="L228"></a><tt class="py-lineno"> 228</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L229"></a><tt class="py-lineno"> 229</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4, 5])</tt> </tt>
<a name="L230"></a><tt class="py-lineno"> 230</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd.filter(lambda x: x % 2 == 0).collect()</tt> </tt>
<a name="L231"></a><tt class="py-lineno"> 231</tt>  <tt class="py-line"><tt class="py-docstring">        [2, 4]</tt> </tt>
<a name="L232"></a><tt class="py-lineno"> 232</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L233"></a><tt class="py-lineno"> 233</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">return</tt> <tt class="py-name">ifilter</tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
<a name="L234"></a><tt class="py-lineno"> 234</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-29" class="py-name" targets="Method pyspark.rdd.RDD.mapPartitions()=pyspark.rdd.RDD-class.html#mapPartitions"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-29', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
</div><a name="L235"></a><tt class="py-lineno"> 235</tt>  <tt class="py-line"> </tt>
<a name="RDD.distinct"></a><div id="RDD.distinct-def"><a name="L236"></a><tt class="py-lineno"> 236</tt> <a class="py-toggle" href="#" id="RDD.distinct-toggle" onclick="return toggle('RDD.distinct');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#distinct">distinct</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.distinct-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.distinct-expanded"><a name="L237"></a><tt class="py-lineno"> 237</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L238"></a><tt class="py-lineno"> 238</tt>  <tt class="py-line"><tt class="py-docstring">        Return a new RDD containing the distinct elements in this RDD.</tt> </tt>
<a name="L239"></a><tt class="py-lineno"> 239</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L240"></a><tt class="py-lineno"> 240</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())</tt> </tt>
<a name="L241"></a><tt class="py-lineno"> 241</tt>  <tt class="py-line"><tt class="py-docstring">        [1, 2, 3]</tt> </tt>
<a name="L242"></a><tt class="py-lineno"> 242</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L243"></a><tt class="py-lineno"> 243</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-30" class="py-name" targets="Method pyspark.rdd.RDD.map()=pyspark.rdd.RDD-class.html#map"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-30', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> \ </tt>
<a name="L244"></a><tt class="py-lineno"> 244</tt>  <tt class="py-line">                   <tt class="py-op">.</tt><tt id="link-31" class="py-name" targets="Method pyspark.rdd.RDD.reduceByKey()=pyspark.rdd.RDD-class.html#reduceByKey"><a title="pyspark.rdd.RDD.reduceByKey" class="py-name" href="#" onclick="return doclink('link-31', 'reduceByKey', 'link-31');">reduceByKey</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">_</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt> \ </tt>
<a name="L245"></a><tt class="py-lineno"> 245</tt>  <tt class="py-line">                   <tt class="py-op">.</tt><tt id="link-32" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-32', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">_</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt> </tt>
</div><a name="L246"></a><tt class="py-lineno"> 246</tt>  <tt class="py-line"> </tt>
<a name="RDD.sample"></a><div id="RDD.sample-def"><a name="L247"></a><tt class="py-lineno"> 247</tt> <a class="py-toggle" href="#" id="RDD.sample-toggle" onclick="return toggle('RDD.sample');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#sample">sample</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">withReplacement</tt><tt class="py-op">,</tt> <tt class="py-param">fraction</tt><tt class="py-op">,</tt> <tt class="py-param">seed</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.sample-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.sample-expanded"><a name="L248"></a><tt class="py-lineno"> 248</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L249"></a><tt class="py-lineno"> 249</tt>  <tt class="py-line"><tt class="py-docstring">        Return a sampled subset of this RDD (relies on numpy and falls back</tt> </tt>
<a name="L250"></a><tt class="py-lineno"> 250</tt>  <tt class="py-line"><tt class="py-docstring">        on default random generator if numpy is unavailable).</tt> </tt>
<a name="L251"></a><tt class="py-lineno"> 251</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L252"></a><tt class="py-lineno"> 252</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize(range(0, 100)).sample(False, 0.1, 2).collect() #doctest: +SKIP</tt> </tt>
<a name="L253"></a><tt class="py-lineno"> 253</tt>  <tt class="py-line"><tt class="py-docstring">        [2, 3, 20, 21, 24, 41, 42, 66, 67, 89, 90, 98]</tt> </tt>
<a name="L254"></a><tt class="py-lineno"> 254</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L255"></a><tt class="py-lineno"> 255</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-33" class="py-name"><a title="pyspark.rdd.RDD.mapPartitionsWithIndex" class="py-name" href="#" onclick="return doclink('link-33', 'mapPartitionsWithIndex', 'link-26');">mapPartitionsWithIndex</a></tt><tt class="py-op">(</tt><tt class="py-name">RDDSampler</tt><tt class="py-op">(</tt><tt class="py-name">withReplacement</tt><tt class="py-op">,</tt> <tt class="py-name">fraction</tt><tt class="py-op">,</tt> <tt class="py-name">seed</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">True</tt><tt class="py-op">)</tt> </tt>
</div><a name="L256"></a><tt class="py-lineno"> 256</tt>  <tt class="py-line"> </tt>
<a name="L257"></a><tt class="py-lineno"> 257</tt>  <tt class="py-line">    <tt class="py-comment"># this is ported from scala/spark/RDD.scala</tt> </tt>
<a name="RDD.takeSample"></a><div id="RDD.takeSample-def"><a name="L258"></a><tt class="py-lineno"> 258</tt> <a class="py-toggle" href="#" id="RDD.takeSample-toggle" onclick="return toggle('RDD.takeSample');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#takeSample">takeSample</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">withReplacement</tt><tt class="py-op">,</tt> <tt class="py-param">num</tt><tt class="py-op">,</tt> <tt class="py-param">seed</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.takeSample-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.takeSample-expanded"><a name="L259"></a><tt class="py-lineno"> 259</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L260"></a><tt class="py-lineno"> 260</tt>  <tt class="py-line"><tt class="py-docstring">        Return a fixed-size sampled subset of this RDD (currently requires numpy).</tt> </tt>
<a name="L261"></a><tt class="py-lineno"> 261</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L262"></a><tt class="py-lineno"> 262</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize(range(0, 10)).takeSample(True, 10, 1) #doctest: +SKIP</tt> </tt>
<a name="L263"></a><tt class="py-lineno"> 263</tt>  <tt class="py-line"><tt class="py-docstring">        [4, 2, 1, 8, 2, 7, 0, 4, 1, 4]</tt> </tt>
<a name="L264"></a><tt class="py-lineno"> 264</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L265"></a><tt class="py-lineno"> 265</tt>  <tt class="py-line"> </tt>
<a name="L266"></a><tt class="py-lineno"> 266</tt>  <tt class="py-line">        <tt class="py-name">fraction</tt> <tt class="py-op">=</tt> <tt class="py-number">0.0</tt> </tt>
<a name="L267"></a><tt class="py-lineno"> 267</tt>  <tt class="py-line">        <tt class="py-name">total</tt> <tt class="py-op">=</tt> <tt class="py-number">0</tt> </tt>
<a name="L268"></a><tt class="py-lineno"> 268</tt>  <tt class="py-line">        <tt class="py-name">multiplier</tt> <tt class="py-op">=</tt> <tt class="py-number">3.0</tt> </tt>
<a name="L269"></a><tt class="py-lineno"> 269</tt>  <tt class="py-line">        <tt class="py-name">initialCount</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-34" class="py-name" targets="Method pyspark.rdd.RDD.count()=pyspark.rdd.RDD-class.html#count,Method pyspark.statcounter.StatCounter.count()=pyspark.statcounter.StatCounter-class.html#count"><a title="pyspark.rdd.RDD.count
pyspark.statcounter.StatCounter.count" class="py-name" href="#" onclick="return doclink('link-34', 'count', 'link-34');">count</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L270"></a><tt class="py-lineno"> 270</tt>  <tt class="py-line">        <tt class="py-name">maxSelected</tt> <tt class="py-op">=</tt> <tt class="py-number">0</tt> </tt>
<a name="L271"></a><tt class="py-lineno"> 271</tt>  <tt class="py-line"> </tt>
<a name="L272"></a><tt class="py-lineno"> 272</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-op">(</tt><tt class="py-name">num</tt> <tt class="py-op">&lt;</tt> <tt class="py-number">0</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L273"></a><tt class="py-lineno"> 273</tt>  <tt class="py-line">            <tt class="py-keyword">raise</tt> <tt class="py-name">ValueError</tt> </tt>
<a name="L274"></a><tt class="py-lineno"> 274</tt>  <tt class="py-line"> </tt>
<a name="L275"></a><tt class="py-lineno"> 275</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">initialCount</tt> <tt class="py-op">&gt;</tt> <tt class="py-name">sys</tt><tt class="py-op">.</tt><tt class="py-name">maxint</tt> <tt class="py-op">-</tt> <tt class="py-number">1</tt><tt class="py-op">:</tt> </tt>
<a name="L276"></a><tt class="py-lineno"> 276</tt>  <tt class="py-line">            <tt class="py-name">maxSelected</tt> <tt class="py-op">=</tt> <tt class="py-name">sys</tt><tt class="py-op">.</tt><tt class="py-name">maxint</tt> <tt class="py-op">-</tt> <tt class="py-number">1</tt> </tt>
<a name="L277"></a><tt class="py-lineno"> 277</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L278"></a><tt class="py-lineno"> 278</tt>  <tt class="py-line">            <tt class="py-name">maxSelected</tt> <tt class="py-op">=</tt> <tt class="py-name">initialCount</tt> </tt>
<a name="L279"></a><tt class="py-lineno"> 279</tt>  <tt class="py-line"> </tt>
<a name="L280"></a><tt class="py-lineno"> 280</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">num</tt> <tt class="py-op">&gt;</tt> <tt class="py-name">initialCount</tt> <tt class="py-keyword">and</tt> <tt class="py-keyword">not</tt> <tt class="py-name">withReplacement</tt><tt class="py-op">:</tt> </tt>
<a name="L281"></a><tt class="py-lineno"> 281</tt>  <tt class="py-line">            <tt class="py-name">total</tt> <tt class="py-op">=</tt> <tt class="py-name">maxSelected</tt> </tt>
<a name="L282"></a><tt class="py-lineno"> 282</tt>  <tt class="py-line">            <tt class="py-name">fraction</tt> <tt class="py-op">=</tt> <tt class="py-name">multiplier</tt> <tt class="py-op">*</tt> <tt class="py-op">(</tt><tt class="py-name">maxSelected</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt> <tt class="py-op">/</tt> <tt class="py-name">initialCount</tt> </tt>
<a name="L283"></a><tt class="py-lineno"> 283</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L284"></a><tt class="py-lineno"> 284</tt>  <tt class="py-line">            <tt class="py-name">fraction</tt> <tt class="py-op">=</tt> <tt class="py-name">multiplier</tt> <tt class="py-op">*</tt> <tt class="py-op">(</tt><tt class="py-name">num</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt> <tt class="py-op">/</tt> <tt class="py-name">initialCount</tt> </tt>
<a name="L285"></a><tt class="py-lineno"> 285</tt>  <tt class="py-line">            <tt class="py-name">total</tt> <tt class="py-op">=</tt> <tt class="py-name">num</tt> </tt>
<a name="L286"></a><tt class="py-lineno"> 286</tt>  <tt class="py-line"> </tt>
<a name="L287"></a><tt class="py-lineno"> 287</tt>  <tt class="py-line">        <tt class="py-name">samples</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-35" class="py-name" targets="Method pyspark.rdd.RDD.sample()=pyspark.rdd.RDD-class.html#sample"><a title="pyspark.rdd.RDD.sample" class="py-name" href="#" onclick="return doclink('link-35', 'sample', 'link-35');">sample</a></tt><tt class="py-op">(</tt><tt class="py-name">withReplacement</tt><tt class="py-op">,</tt> <tt class="py-name">fraction</tt><tt class="py-op">,</tt> <tt class="py-name">seed</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-36" class="py-name" targets="Method pyspark.rdd.RDD.collect()=pyspark.rdd.RDD-class.html#collect"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-36', 'collect', 'link-36');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L288"></a><tt class="py-lineno"> 288</tt>  <tt class="py-line"> </tt>
<a name="L289"></a><tt class="py-lineno"> 289</tt>  <tt class="py-line">        <tt class="py-comment"># If the first sample didn't turn out large enough, keep trying to take samples;</tt> </tt>
<a name="L290"></a><tt class="py-lineno"> 290</tt>  <tt class="py-line">        <tt class="py-comment"># this shouldn't happen often because we use a big multiplier for their initial size.</tt> </tt>
<a name="L291"></a><tt class="py-lineno"> 291</tt>  <tt class="py-line">        <tt class="py-comment"># See: scala/spark/RDD.scala</tt> </tt>
<a name="L292"></a><tt class="py-lineno"> 292</tt>  <tt class="py-line">        <tt class="py-keyword">while</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">samples</tt><tt class="py-op">)</tt> <tt class="py-op">&lt;</tt> <tt class="py-name">total</tt><tt class="py-op">:</tt> </tt>
<a name="L293"></a><tt class="py-lineno"> 293</tt>  <tt class="py-line">            <tt class="py-keyword">if</tt> <tt class="py-name">seed</tt> <tt class="py-op">&gt;</tt> <tt class="py-name">sys</tt><tt class="py-op">.</tt><tt class="py-name">maxint</tt> <tt class="py-op">-</tt> <tt class="py-number">2</tt><tt class="py-op">:</tt> </tt>
<a name="L294"></a><tt class="py-lineno"> 294</tt>  <tt class="py-line">                <tt class="py-name">seed</tt> <tt class="py-op">=</tt> <tt class="py-op">-</tt><tt class="py-number">1</tt> </tt>
<a name="L295"></a><tt class="py-lineno"> 295</tt>  <tt class="py-line">            <tt class="py-name">seed</tt> <tt class="py-op">+=</tt> <tt class="py-number">1</tt> </tt>
<a name="L296"></a><tt class="py-lineno"> 296</tt>  <tt class="py-line">            <tt class="py-name">samples</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-37" class="py-name"><a title="pyspark.rdd.RDD.sample" class="py-name" href="#" onclick="return doclink('link-37', 'sample', 'link-35');">sample</a></tt><tt class="py-op">(</tt><tt class="py-name">withReplacement</tt><tt class="py-op">,</tt> <tt class="py-name">fraction</tt><tt class="py-op">,</tt> <tt class="py-name">seed</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-38" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-38', 'collect', 'link-36');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L297"></a><tt class="py-lineno"> 297</tt>  <tt class="py-line"> </tt>
<a name="L298"></a><tt class="py-lineno"> 298</tt>  <tt class="py-line">        <tt class="py-name">sampler</tt> <tt class="py-op">=</tt> <tt class="py-name">RDDSampler</tt><tt class="py-op">(</tt><tt class="py-name">withReplacement</tt><tt class="py-op">,</tt> <tt class="py-name">fraction</tt><tt class="py-op">,</tt> <tt class="py-name">seed</tt><tt class="py-op">+</tt><tt class="py-number">1</tt><tt class="py-op">)</tt> </tt>
<a name="L299"></a><tt class="py-lineno"> 299</tt>  <tt class="py-line">        <tt class="py-name">sampler</tt><tt class="py-op">.</tt><tt class="py-name">shuffle</tt><tt class="py-op">(</tt><tt class="py-name">samples</tt><tt class="py-op">)</tt> </tt>
<a name="L300"></a><tt class="py-lineno"> 300</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">samples</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">:</tt><tt class="py-name">total</tt><tt class="py-op">]</tt> </tt>
</div><a name="L301"></a><tt class="py-lineno"> 301</tt>  <tt class="py-line"> </tt>
<a name="RDD.union"></a><div id="RDD.union-def"><a name="L302"></a><tt class="py-lineno"> 302</tt> <a class="py-toggle" href="#" id="RDD.union-toggle" onclick="return toggle('RDD.union');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#union">union</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.union-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.union-expanded"><a name="L303"></a><tt class="py-lineno"> 303</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L304"></a><tt class="py-lineno"> 304</tt>  <tt class="py-line"><tt class="py-docstring">        Return the union of this RDD and another one.</tt> </tt>
<a name="L305"></a><tt class="py-lineno"> 305</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L306"></a><tt class="py-lineno"> 306</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 1, 2, 3])</tt> </tt>
<a name="L307"></a><tt class="py-lineno"> 307</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd.union(rdd).collect()</tt> </tt>
<a name="L308"></a><tt class="py-lineno"> 308</tt>  <tt class="py-line"><tt class="py-docstring">        [1, 1, 2, 3, 1, 1, 2, 3]</tt> </tt>
<a name="L309"></a><tt class="py-lineno"> 309</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L310"></a><tt class="py-lineno"> 310</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt> <tt class="py-op">==</tt> <tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt><tt class="py-op">:</tt> </tt>
<a name="L311"></a><tt class="py-lineno"> 311</tt>  <tt class="py-line">            <tt id="link-39" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-39', 'rdd', 'link-19');">rdd</a></tt> <tt class="py-op">=</tt> <tt id="link-40" class="py-name" targets="Class pyspark.rdd.RDD=pyspark.rdd.RDD-class.html"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-40', 'RDD', 'link-40');">RDD</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-41" class="py-name" targets="Method pyspark.context.SparkContext.union()=pyspark.context.SparkContext-class.html#union,Method pyspark.rdd.RDD.union()=pyspark.rdd.RDD-class.html#union"><a title="pyspark.context.SparkContext.union
pyspark.rdd.RDD.union" class="py-name" href="#" onclick="return doclink('link-41', 'union', 'link-41');">union</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">,</tt> </tt>
<a name="L312"></a><tt class="py-lineno"> 312</tt>  <tt class="py-line">                      <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt><tt class="py-op">)</tt> </tt>
<a name="L313"></a><tt class="py-lineno"> 313</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt id="link-42" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-42', 'rdd', 'link-19');">rdd</a></tt> </tt>
<a name="L314"></a><tt class="py-lineno"> 314</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L315"></a><tt class="py-lineno"> 315</tt>  <tt class="py-line">            <tt class="py-comment"># These RDDs contain data in different serialized formats, so we</tt> </tt>
<a name="L316"></a><tt class="py-lineno"> 316</tt>  <tt class="py-line">            <tt class="py-comment"># must normalize them to the default serializer.</tt> </tt>
<a name="L317"></a><tt class="py-lineno"> 317</tt>  <tt class="py-line">            <tt class="py-name">self_copy</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_reserialize</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L318"></a><tt class="py-lineno"> 318</tt>  <tt class="py-line">            <tt class="py-name">other_copy</tt> <tt class="py-op">=</tt> <tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_reserialize</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L319"></a><tt class="py-lineno"> 319</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt id="link-43" class="py-name"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-43', 'RDD', 'link-40');">RDD</a></tt><tt class="py-op">(</tt><tt class="py-name">self_copy</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-44" class="py-name"><a title="pyspark.context.SparkContext.union
pyspark.rdd.RDD.union" class="py-name" href="#" onclick="return doclink('link-44', 'union', 'link-41');">union</a></tt><tt class="py-op">(</tt><tt class="py-name">other_copy</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">,</tt> </tt>
<a name="L320"></a><tt class="py-lineno"> 320</tt>  <tt class="py-line">                       <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">serializer</tt><tt class="py-op">)</tt> </tt>
</div><a name="L321"></a><tt class="py-lineno"> 321</tt>  <tt class="py-line"> </tt>
<a name="RDD._reserialize"></a><div id="RDD._reserialize-def"><a name="L322"></a><tt class="py-lineno"> 322</tt> <a class="py-toggle" href="#" id="RDD._reserialize-toggle" onclick="return toggle('RDD._reserialize');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#_reserialize">_reserialize</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD._reserialize-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD._reserialize-expanded"><a name="L323"></a><tt class="py-lineno"> 323</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt> <tt class="py-op">==</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">serializer</tt><tt class="py-op">:</tt> </tt>
<a name="L324"></a><tt class="py-lineno"> 324</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">self</tt> </tt>
<a name="L325"></a><tt class="py-lineno"> 325</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L326"></a><tt class="py-lineno"> 326</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-45" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-45', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt> </tt>
</div><a name="L327"></a><tt class="py-lineno"> 327</tt>  <tt class="py-line"> </tt>
<a name="RDD.__add__"></a><div id="RDD.__add__-def"><a name="L328"></a><tt class="py-lineno"> 328</tt> <a class="py-toggle" href="#" id="RDD.__add__-toggle" onclick="return toggle('RDD.__add__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#__add__">__add__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.__add__-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.__add__-expanded"><a name="L329"></a><tt class="py-lineno"> 329</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L330"></a><tt class="py-lineno"> 330</tt>  <tt class="py-line"><tt class="py-docstring">        Return the union of this RDD and another one.</tt> </tt>
<a name="L331"></a><tt class="py-lineno"> 331</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L332"></a><tt class="py-lineno"> 332</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 1, 2, 3])</tt> </tt>
<a name="L333"></a><tt class="py-lineno"> 333</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; (rdd + rdd).collect()</tt> </tt>
<a name="L334"></a><tt class="py-lineno"> 334</tt>  <tt class="py-line"><tt class="py-docstring">        [1, 1, 2, 3, 1, 1, 2, 3]</tt> </tt>
<a name="L335"></a><tt class="py-lineno"> 335</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L336"></a><tt class="py-lineno"> 336</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">isinstance</tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">,</tt> <tt id="link-46" class="py-name"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-46', 'RDD', 'link-40');">RDD</a></tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L337"></a><tt class="py-lineno"> 337</tt>  <tt class="py-line">            <tt class="py-keyword">raise</tt> <tt class="py-name">TypeError</tt> </tt>
<a name="L338"></a><tt class="py-lineno"> 338</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-47" class="py-name"><a title="pyspark.context.SparkContext.union
pyspark.rdd.RDD.union" class="py-name" href="#" onclick="return doclink('link-47', 'union', 'link-41');">union</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">)</tt> </tt>
</div><a name="L339"></a><tt class="py-lineno"> 339</tt>  <tt class="py-line"> </tt>
<a name="RDD.sortByKey"></a><div id="RDD.sortByKey-def"><a name="L340"></a><tt class="py-lineno"> 340</tt> <a class="py-toggle" href="#" id="RDD.sortByKey-toggle" onclick="return toggle('RDD.sortByKey');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#sortByKey">sortByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">ascending</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">,</tt> <tt class="py-param">keyfunc</tt> <tt class="py-op">=</tt> <tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.sortByKey-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.sortByKey-expanded"><a name="L341"></a><tt class="py-lineno"> 341</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L342"></a><tt class="py-lineno"> 342</tt>  <tt class="py-line"><tt class="py-docstring">        Sorts this RDD, which is assumed to consist of (key, value) pairs.</tt> </tt>
<a name="L343"></a><tt class="py-lineno"> 343</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L344"></a><tt class="py-lineno"> 344</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]</tt> </tt>
<a name="L345"></a><tt class="py-lineno"> 345</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize(tmp).sortByKey(True, 2).collect()</tt> </tt>
<a name="L346"></a><tt class="py-lineno"> 346</tt>  <tt class="py-line"><tt class="py-docstring">        [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]</tt> </tt>
<a name="L347"></a><tt class="py-lineno"> 347</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]</tt> </tt>
<a name="L348"></a><tt class="py-lineno"> 348</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])</tt> </tt>
<a name="L349"></a><tt class="py-lineno"> 349</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect()</tt> </tt>
<a name="L350"></a><tt class="py-lineno"> 350</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5), ('little', 4), ('Mary', 1), ('was', 8), ('white', 9), ('whose', 6)]</tt> </tt>
<a name="L351"></a><tt class="py-lineno"> 351</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L352"></a><tt class="py-lineno"> 352</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">numPartitions</tt> <tt class="py-keyword">is</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
<a name="L353"></a><tt class="py-lineno"> 353</tt>  <tt class="py-line">            <tt class="py-name">numPartitions</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-48" class="py-name" targets="Method pyspark.context.SparkContext.defaultParallelism()=pyspark.context.SparkContext-class.html#defaultParallelism"><a title="pyspark.context.SparkContext.defaultParallelism" class="py-name" href="#" onclick="return doclink('link-48', 'defaultParallelism', 'link-48');">defaultParallelism</a></tt> </tt>
<a name="L354"></a><tt class="py-lineno"> 354</tt>  <tt class="py-line"> </tt>
<a name="L355"></a><tt class="py-lineno"> 355</tt>  <tt class="py-line">        <tt class="py-name">bounds</tt> <tt class="py-op">=</tt> <tt class="py-name">list</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L356"></a><tt class="py-lineno"> 356</tt>  <tt class="py-line"> </tt>
<a name="L357"></a><tt class="py-lineno"> 357</tt>  <tt class="py-line">        <tt class="py-comment"># first compute the boundary of each part via sampling: we want to partition</tt> </tt>
<a name="L358"></a><tt class="py-lineno"> 358</tt>  <tt class="py-line">        <tt class="py-comment"># the key-space into bins such that the bins have roughly the same</tt> </tt>
<a name="L359"></a><tt class="py-lineno"> 359</tt>  <tt class="py-line">        <tt class="py-comment"># number of (key, value) pairs falling into them</tt> </tt>
<a name="L360"></a><tt class="py-lineno"> 360</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">numPartitions</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">1</tt><tt class="py-op">:</tt> </tt>
<a name="L361"></a><tt class="py-lineno"> 361</tt>  <tt class="py-line">            <tt class="py-name">rddSize</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-49" class="py-name"><a title="pyspark.rdd.RDD.count
pyspark.statcounter.StatCounter.count" class="py-name" href="#" onclick="return doclink('link-49', 'count', 'link-34');">count</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L362"></a><tt class="py-lineno"> 362</tt>  <tt class="py-line">            <tt class="py-name">maxSampleSize</tt> <tt class="py-op">=</tt> <tt class="py-name">numPartitions</tt> <tt class="py-op">*</tt> <tt class="py-number">20.0</tt> <tt class="py-comment"># constant from Spark's RangePartitioner</tt> </tt>
<a name="L363"></a><tt class="py-lineno"> 363</tt>  <tt class="py-line">            <tt class="py-name">fraction</tt> <tt class="py-op">=</tt> <tt class="py-name">min</tt><tt class="py-op">(</tt><tt class="py-name">maxSampleSize</tt> <tt class="py-op">/</tt> <tt class="py-name">max</tt><tt class="py-op">(</tt><tt class="py-name">rddSize</tt><tt class="py-op">,</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-number">1.0</tt><tt class="py-op">)</tt> </tt>
<a name="L364"></a><tt class="py-lineno"> 364</tt>  <tt class="py-line"> </tt>
<a name="L365"></a><tt class="py-lineno"> 365</tt>  <tt class="py-line">            <tt class="py-name">samples</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-50" class="py-name"><a title="pyspark.rdd.RDD.sample" class="py-name" href="#" onclick="return doclink('link-50', 'sample', 'link-35');">sample</a></tt><tt class="py-op">(</tt><tt class="py-name">False</tt><tt class="py-op">,</tt> <tt class="py-name">fraction</tt><tt class="py-op">,</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-51" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-51', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-name">k</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-52" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-52', 'collect', 'link-36');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L366"></a><tt class="py-lineno"> 366</tt>  <tt class="py-line">            <tt class="py-name">samples</tt> <tt class="py-op">=</tt> <tt class="py-name">sorted</tt><tt class="py-op">(</tt><tt class="py-name">samples</tt><tt class="py-op">,</tt> <tt class="py-name">reverse</tt><tt class="py-op">=</tt><tt class="py-op">(</tt><tt class="py-keyword">not</tt> <tt class="py-name">ascending</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">key</tt><tt class="py-op">=</tt><tt class="py-name">keyfunc</tt><tt class="py-op">)</tt> </tt>
<a name="L367"></a><tt class="py-lineno"> 367</tt>  <tt class="py-line"> </tt>
<a name="L368"></a><tt class="py-lineno"> 368</tt>  <tt class="py-line">            <tt class="py-comment"># we have numPartitions many parts but one of them has</tt> </tt>
<a name="L369"></a><tt class="py-lineno"> 369</tt>  <tt class="py-line">            <tt class="py-comment"># an implicit boundary</tt> </tt>
<a name="L370"></a><tt class="py-lineno"> 370</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">i</tt> <tt class="py-keyword">in</tt> <tt class="py-name">range</tt><tt class="py-op">(</tt><tt class="py-number">0</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt> <tt class="py-op">-</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L371"></a><tt class="py-lineno"> 371</tt>  <tt class="py-line">                <tt class="py-name">index</tt> <tt class="py-op">=</tt> <tt class="py-op">(</tt><tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">samples</tt><tt class="py-op">)</tt> <tt class="py-op">-</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt> <tt class="py-op">*</tt> <tt class="py-op">(</tt><tt class="py-name">i</tt> <tt class="py-op">+</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt> <tt class="py-op">/</tt> <tt class="py-name">numPartitions</tt> </tt>
<a name="L372"></a><tt class="py-lineno"> 372</tt>  <tt class="py-line">                <tt class="py-name">bounds</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-name">samples</tt><tt class="py-op">[</tt><tt class="py-name">index</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
<a name="L373"></a><tt class="py-lineno"> 373</tt>  <tt class="py-line"> </tt>
<a name="L374"></a><tt class="py-lineno"> 374</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">rangePartitionFunc</tt><tt class="py-op">(</tt><tt class="py-param">k</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L375"></a><tt class="py-lineno"> 375</tt>  <tt class="py-line">            <tt class="py-name">p</tt> <tt class="py-op">=</tt> <tt class="py-number">0</tt> </tt>
<a name="L376"></a><tt class="py-lineno"> 376</tt>  <tt class="py-line">            <tt class="py-keyword">while</tt> <tt class="py-name">p</tt> <tt class="py-op">&lt;</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">bounds</tt><tt class="py-op">)</tt> <tt class="py-keyword">and</tt> <tt class="py-name">keyfunc</tt><tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;</tt> <tt class="py-name">bounds</tt><tt class="py-op">[</tt><tt class="py-name">p</tt><tt class="py-op">]</tt><tt class="py-op">:</tt> </tt>
<a name="L377"></a><tt class="py-lineno"> 377</tt>  <tt class="py-line">                <tt class="py-name">p</tt> <tt class="py-op">+=</tt> <tt class="py-number">1</tt> </tt>
<a name="L378"></a><tt class="py-lineno"> 378</tt>  <tt class="py-line">            <tt class="py-keyword">if</tt> <tt class="py-name">ascending</tt><tt class="py-op">:</tt> </tt>
<a name="L379"></a><tt class="py-lineno"> 379</tt>  <tt class="py-line">                <tt class="py-keyword">return</tt> <tt class="py-name">p</tt> </tt>
<a name="L380"></a><tt class="py-lineno"> 380</tt>  <tt class="py-line">            <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L381"></a><tt class="py-lineno"> 381</tt>  <tt class="py-line">                <tt class="py-keyword">return</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">-</tt><tt class="py-name">p</tt> </tt>
<a name="L382"></a><tt class="py-lineno"> 382</tt>  <tt class="py-line"> </tt>
<a name="L383"></a><tt class="py-lineno"> 383</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">mapFunc</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L384"></a><tt class="py-lineno"> 384</tt>  <tt class="py-line">            <tt class="py-keyword">yield</tt> <tt class="py-name">sorted</tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">,</tt> <tt class="py-name">reverse</tt><tt class="py-op">=</tt><tt class="py-op">(</tt><tt class="py-keyword">not</tt> <tt class="py-name">ascending</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">key</tt><tt class="py-op">=</tt><tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-name">keyfunc</tt><tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L385"></a><tt class="py-lineno"> 385</tt>  <tt class="py-line"> </tt>
<a name="L386"></a><tt class="py-lineno"> 386</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-53" class="py-name" targets="Method pyspark.rdd.RDD.partitionBy()=pyspark.rdd.RDD-class.html#partitionBy"><a title="pyspark.rdd.RDD.partitionBy" class="py-name" href="#" onclick="return doclink('link-53', 'partitionBy', 'link-53');">partitionBy</a></tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">,</tt> <tt class="py-name">partitionFunc</tt><tt class="py-op">=</tt><tt class="py-name">rangePartitionFunc</tt><tt class="py-op">)</tt> </tt>
<a name="L387"></a><tt class="py-lineno"> 387</tt>  <tt class="py-line">                    <tt class="py-op">.</tt><tt id="link-54" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-54', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">mapFunc</tt><tt class="py-op">,</tt><tt class="py-name">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt> </tt>
<a name="L388"></a><tt class="py-lineno"> 388</tt>  <tt class="py-line">                    <tt class="py-op">.</tt><tt id="link-55" class="py-name" targets="Method pyspark.rdd.RDD.flatMap()=pyspark.rdd.RDD-class.html#flatMap"><a title="pyspark.rdd.RDD.flatMap" class="py-name" href="#" onclick="return doclink('link-55', 'flatMap', 'link-55');">flatMap</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div><a name="L389"></a><tt class="py-lineno"> 389</tt>  <tt class="py-line"> </tt>
<a name="RDD.glom"></a><div id="RDD.glom-def"><a name="L390"></a><tt class="py-lineno"> 390</tt> <a class="py-toggle" href="#" id="RDD.glom-toggle" onclick="return toggle('RDD.glom');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#glom">glom</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.glom-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.glom-expanded"><a name="L391"></a><tt class="py-lineno"> 391</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L392"></a><tt class="py-lineno"> 392</tt>  <tt class="py-line"><tt class="py-docstring">        Return an RDD created by coalescing all elements within each partition</tt> </tt>
<a name="L393"></a><tt class="py-lineno"> 393</tt>  <tt class="py-line"><tt class="py-docstring">        into a list.</tt> </tt>
<a name="L394"></a><tt class="py-lineno"> 394</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L395"></a><tt class="py-lineno"> 395</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4], 2)</tt> </tt>
<a name="L396"></a><tt class="py-lineno"> 396</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(rdd.glom().collect())</tt> </tt>
<a name="L397"></a><tt class="py-lineno"> 397</tt>  <tt class="py-line"><tt class="py-docstring">        [[1, 2], [3, 4]]</tt> </tt>
<a name="L398"></a><tt class="py-lineno"> 398</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L399"></a><tt class="py-lineno"> 399</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-keyword">yield</tt> <tt class="py-name">list</tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
<a name="L400"></a><tt class="py-lineno"> 400</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-56" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-56', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
</div><a name="L401"></a><tt class="py-lineno"> 401</tt>  <tt class="py-line"> </tt>
<a name="RDD.cartesian"></a><div id="RDD.cartesian-def"><a name="L402"></a><tt class="py-lineno"> 402</tt> <a class="py-toggle" href="#" id="RDD.cartesian-toggle" onclick="return toggle('RDD.cartesian');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#cartesian">cartesian</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.cartesian-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.cartesian-expanded"><a name="L403"></a><tt class="py-lineno"> 403</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L404"></a><tt class="py-lineno"> 404</tt>  <tt class="py-line"><tt class="py-docstring">        Return the Cartesian product of this RDD and another one, that is, the</tt> </tt>
<a name="L405"></a><tt class="py-lineno"> 405</tt>  <tt class="py-line"><tt class="py-docstring">        RDD of all pairs of elements C{(a, b)} where C{a} is in C{self} and</tt> </tt>
<a name="L406"></a><tt class="py-lineno"> 406</tt>  <tt class="py-line"><tt class="py-docstring">        C{b} is in C{other}.</tt> </tt>
<a name="L407"></a><tt class="py-lineno"> 407</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L408"></a><tt class="py-lineno"> 408</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 2])</tt> </tt>
<a name="L409"></a><tt class="py-lineno"> 409</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(rdd.cartesian(rdd).collect())</tt> </tt>
<a name="L410"></a><tt class="py-lineno"> 410</tt>  <tt class="py-line"><tt class="py-docstring">        [(1, 1), (1, 2), (2, 1), (2, 2)]</tt> </tt>
<a name="L411"></a><tt class="py-lineno"> 411</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L412"></a><tt class="py-lineno"> 412</tt>  <tt class="py-line">        <tt class="py-comment"># Due to batching, we can't use the Java cartesian method.</tt> </tt>
<a name="L413"></a><tt class="py-lineno"> 413</tt>  <tt class="py-line">        <tt class="py-name">deserializer</tt> <tt class="py-op">=</tt> <tt class="py-name">CartesianDeserializer</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt><tt class="py-op">,</tt> </tt>
<a name="L414"></a><tt class="py-lineno"> 414</tt>  <tt class="py-line">                                             <tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt><tt class="py-op">)</tt> </tt>
<a name="L415"></a><tt class="py-lineno"> 415</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-57" class="py-name"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-57', 'RDD', 'link-40');">RDD</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-58" class="py-name" targets="Method pyspark.rdd.RDD.cartesian()=pyspark.rdd.RDD-class.html#cartesian"><a title="pyspark.rdd.RDD.cartesian" class="py-name" href="#" onclick="return doclink('link-58', 'cartesian', 'link-58');">cartesian</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">,</tt> <tt class="py-name">deserializer</tt><tt class="py-op">)</tt> </tt>
</div><a name="L416"></a><tt class="py-lineno"> 416</tt>  <tt class="py-line"> </tt>
<a name="RDD.groupBy"></a><div id="RDD.groupBy-def"><a name="L417"></a><tt class="py-lineno"> 417</tt> <a class="py-toggle" href="#" id="RDD.groupBy-toggle" onclick="return toggle('RDD.groupBy');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#groupBy">groupBy</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.groupBy-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.groupBy-expanded"><a name="L418"></a><tt class="py-lineno"> 418</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L419"></a><tt class="py-lineno"> 419</tt>  <tt class="py-line"><tt class="py-docstring">        Return an RDD of grouped items.</tt> </tt>
<a name="L420"></a><tt class="py-lineno"> 420</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L421"></a><tt class="py-lineno"> 421</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([1, 1, 2, 3, 5, 8])</tt> </tt>
<a name="L422"></a><tt class="py-lineno"> 422</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; result = rdd.groupBy(lambda x: x % 2).collect()</tt> </tt>
<a name="L423"></a><tt class="py-lineno"> 423</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted([(x, sorted(y)) for (x, y) in result])</tt> </tt>
<a name="L424"></a><tt class="py-lineno"> 424</tt>  <tt class="py-line"><tt class="py-docstring">        [(0, [2, 8]), (1, [1, 1, 3, 5])]</tt> </tt>
<a name="L425"></a><tt class="py-lineno"> 425</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L426"></a><tt class="py-lineno"> 426</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-59" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-59', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-60" class="py-name" targets="Method pyspark.rdd.RDD.groupByKey()=pyspark.rdd.RDD-class.html#groupByKey"><a title="pyspark.rdd.RDD.groupByKey" class="py-name" href="#" onclick="return doclink('link-60', 'groupByKey', 'link-60');">groupByKey</a></tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
</div><a name="L427"></a><tt class="py-lineno"> 427</tt>  <tt class="py-line"> </tt>
<a name="RDD.pipe"></a><div id="RDD.pipe-def"><a name="L428"></a><tt class="py-lineno"> 428</tt> <a class="py-toggle" href="#" id="RDD.pipe-toggle" onclick="return toggle('RDD.pipe');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#pipe">pipe</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">command</tt><tt class="py-op">,</tt> <tt class="py-param">env</tt><tt class="py-op">=</tt><tt class="py-op">{</tt><tt class="py-op">}</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.pipe-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.pipe-expanded"><a name="L429"></a><tt class="py-lineno"> 429</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L430"></a><tt class="py-lineno"> 430</tt>  <tt class="py-line"><tt class="py-docstring">        Return an RDD created by piping elements to a forked external process.</tt> </tt>
<a name="L431"></a><tt class="py-lineno"> 431</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L432"></a><tt class="py-lineno"> 432</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3]).pipe('cat').collect()</tt> </tt>
<a name="L433"></a><tt class="py-lineno"> 433</tt>  <tt class="py-line"><tt class="py-docstring">        ['1', '2', '3']</tt> </tt>
<a name="L434"></a><tt class="py-lineno"> 434</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L435"></a><tt class="py-lineno"> 435</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L436"></a><tt class="py-lineno"> 436</tt>  <tt class="py-line">            <tt id="link-61" class="py-name" targets="Method pyspark.rdd.RDD.pipe()=pyspark.rdd.RDD-class.html#pipe"><a title="pyspark.rdd.RDD.pipe" class="py-name" href="#" onclick="return doclink('link-61', 'pipe', 'link-61');">pipe</a></tt> <tt class="py-op">=</tt> <tt class="py-name">Popen</tt><tt class="py-op">(</tt><tt class="py-name">shlex</tt><tt class="py-op">.</tt><tt class="py-name">split</tt><tt class="py-op">(</tt><tt class="py-name">command</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">env</tt><tt class="py-op">=</tt><tt class="py-name">env</tt><tt class="py-op">,</tt> <tt class="py-name">stdin</tt><tt class="py-op">=</tt><tt class="py-name">PIPE</tt><tt class="py-op">,</tt> <tt class="py-name">stdout</tt><tt class="py-op">=</tt><tt class="py-name">PIPE</tt><tt class="py-op">)</tt> </tt>
<a name="L437"></a><tt class="py-lineno"> 437</tt>  <tt class="py-line">            <tt class="py-keyword">def</tt> <tt class="py-def-name">pipe_objs</tt><tt class="py-op">(</tt><tt class="py-param">out</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L438"></a><tt class="py-lineno"> 438</tt>  <tt class="py-line">                <tt class="py-keyword">for</tt> <tt class="py-name">obj</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L439"></a><tt class="py-lineno"> 439</tt>  <tt class="py-line">                    <tt class="py-name">out</tt><tt class="py-op">.</tt><tt class="py-name">write</tt><tt class="py-op">(</tt><tt class="py-name">str</tt><tt class="py-op">(</tt><tt class="py-name">obj</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">rstrip</tt><tt class="py-op">(</tt><tt class="py-string">'\n'</tt><tt class="py-op">)</tt> <tt class="py-op">+</tt> <tt class="py-string">'\n'</tt><tt class="py-op">)</tt> </tt>
<a name="L440"></a><tt class="py-lineno"> 440</tt>  <tt class="py-line">                <tt class="py-name">out</tt><tt class="py-op">.</tt><tt class="py-name">close</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L441"></a><tt class="py-lineno"> 441</tt>  <tt class="py-line">            <tt class="py-name">Thread</tt><tt class="py-op">(</tt><tt class="py-name">target</tt><tt class="py-op">=</tt><tt class="py-name">pipe_objs</tt><tt class="py-op">,</tt> <tt class="py-name">args</tt><tt class="py-op">=</tt><tt class="py-op">[</tt><tt id="link-62" class="py-name"><a title="pyspark.rdd.RDD.pipe" class="py-name" href="#" onclick="return doclink('link-62', 'pipe', 'link-61');">pipe</a></tt><tt class="py-op">.</tt><tt class="py-name">stdin</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">start</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L442"></a><tt class="py-lineno"> 442</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">.</tt><tt class="py-name">rstrip</tt><tt class="py-op">(</tt><tt class="py-string">'\n'</tt><tt class="py-op">)</tt> <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt id="link-63" class="py-name"><a title="pyspark.rdd.RDD.pipe" class="py-name" href="#" onclick="return doclink('link-63', 'pipe', 'link-61');">pipe</a></tt><tt class="py-op">.</tt><tt class="py-name">stdout</tt><tt class="py-op">)</tt> </tt>
<a name="L443"></a><tt class="py-lineno"> 443</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-64" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-64', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
</div><a name="L444"></a><tt class="py-lineno"> 444</tt>  <tt class="py-line"> </tt>
<a name="RDD.foreach"></a><div id="RDD.foreach-def"><a name="L445"></a><tt class="py-lineno"> 445</tt> <a class="py-toggle" href="#" id="RDD.foreach-toggle" onclick="return toggle('RDD.foreach');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#foreach">foreach</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.foreach-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.foreach-expanded"><a name="L446"></a><tt class="py-lineno"> 446</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L447"></a><tt class="py-lineno"> 447</tt>  <tt class="py-line"><tt class="py-docstring">        Applies a function to all elements of this RDD.</tt> </tt>
<a name="L448"></a><tt class="py-lineno"> 448</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L449"></a><tt class="py-lineno"> 449</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; def f(x): print x</tt> </tt>
<a name="L450"></a><tt class="py-lineno"> 450</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3, 4, 5]).foreach(f)</tt> </tt>
<a name="L451"></a><tt class="py-lineno"> 451</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L452"></a><tt class="py-lineno"> 452</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">processPartition</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L453"></a><tt class="py-lineno"> 453</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L454"></a><tt class="py-lineno"> 454</tt>  <tt class="py-line">                <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt> </tt>
<a name="L455"></a><tt class="py-lineno"> 455</tt>  <tt class="py-line">            <tt class="py-keyword">yield</tt> <tt class="py-name">None</tt> </tt>
</div><a name="L456"></a><tt class="py-lineno"> 456</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-65" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-65', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">processPartition</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-66" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-66', 'collect', 'link-36');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt>  <tt class="py-comment"># Force evaluation</tt> </tt>
</div><a name="L457"></a><tt class="py-lineno"> 457</tt>  <tt class="py-line"> </tt>
<a name="RDD.collect"></a><div id="RDD.collect-def"><a name="L458"></a><tt class="py-lineno"> 458</tt> <a class="py-toggle" href="#" id="RDD.collect-toggle" onclick="return toggle('RDD.collect');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#collect">collect</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.collect-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.collect-expanded"><a name="L459"></a><tt class="py-lineno"> 459</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L460"></a><tt class="py-lineno"> 460</tt>  <tt class="py-line"><tt class="py-docstring">        Return a list that contains all of the elements in this RDD.</tt> </tt>
<a name="L461"></a><tt class="py-lineno"> 461</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L462"></a><tt class="py-lineno"> 462</tt>  <tt class="py-line">        <tt class="py-keyword">with</tt> <tt class="py-name">_JavaStackTrace</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-67" class="py-name" targets="Module pyspark.context=pyspark.context-module.html,Method pyspark.rdd.RDD.context()=pyspark.rdd.RDD-class.html#context"><a title="pyspark.context
pyspark.rdd.RDD.context" class="py-name" href="#" onclick="return doclink('link-67', 'context', 'link-67');">context</a></tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">st</tt><tt class="py-op">:</tt> </tt>
<a name="L463"></a><tt class="py-lineno"> 463</tt>  <tt class="py-line">          <tt class="py-name">bytesInJava</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-68" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-68', 'collect', 'link-36');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">iterator</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L464"></a><tt class="py-lineno"> 464</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">list</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_collect_iterator_through_file</tt><tt class="py-op">(</tt><tt class="py-name">bytesInJava</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div><a name="L465"></a><tt class="py-lineno"> 465</tt>  <tt class="py-line"> </tt>
<a name="RDD._collect_iterator_through_file"></a><div id="RDD._collect_iterator_through_file-def"><a name="L466"></a><tt class="py-lineno"> 466</tt> <a class="py-toggle" href="#" id="RDD._collect_iterator_through_file-toggle" onclick="return toggle('RDD._collect_iterator_through_file');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#_collect_iterator_through_file">_collect_iterator_through_file</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD._collect_iterator_through_file-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD._collect_iterator_through_file-expanded"><a name="L467"></a><tt class="py-lineno"> 467</tt>  <tt class="py-line">        <tt class="py-comment"># Transferring lots of data through Py4J can be slow because</tt> </tt>
<a name="L468"></a><tt class="py-lineno"> 468</tt>  <tt class="py-line">        <tt class="py-comment"># socket.readline() is inefficient.  Instead, we'll dump the data to a</tt> </tt>
<a name="L469"></a><tt class="py-lineno"> 469</tt>  <tt class="py-line">        <tt class="py-comment"># file and read it back.</tt> </tt>
<a name="L470"></a><tt class="py-lineno"> 470</tt>  <tt class="py-line">        <tt class="py-name">tempFile</tt> <tt class="py-op">=</tt> <tt class="py-name">NamedTemporaryFile</tt><tt class="py-op">(</tt><tt class="py-name">delete</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">,</tt> <tt class="py-name">dir</tt><tt class="py-op">=</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_temp_dir</tt><tt class="py-op">)</tt> </tt>
<a name="L471"></a><tt class="py-lineno"> 471</tt>  <tt class="py-line">        <tt class="py-name">tempFile</tt><tt class="py-op">.</tt><tt class="py-name">close</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L472"></a><tt class="py-lineno"> 472</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-69" class="py-name" targets="Variable pyspark.context.SparkContext._writeToFile=pyspark.context.SparkContext-class.html#_writeToFile"><a title="pyspark.context.SparkContext._writeToFile" class="py-name" href="#" onclick="return doclink('link-69', '_writeToFile', 'link-69');">_writeToFile</a></tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">,</tt> <tt class="py-name">tempFile</tt><tt class="py-op">.</tt><tt class="py-name">name</tt><tt class="py-op">)</tt> </tt>
<a name="L473"></a><tt class="py-lineno"> 473</tt>  <tt class="py-line">        <tt class="py-comment"># Read the data into Python and deserialize it:</tt> </tt>
<a name="L474"></a><tt class="py-lineno"> 474</tt>  <tt class="py-line">        <tt class="py-keyword">with</tt> <tt class="py-name">open</tt><tt class="py-op">(</tt><tt class="py-name">tempFile</tt><tt class="py-op">.</tt><tt class="py-name">name</tt><tt class="py-op">,</tt> <tt class="py-string">'rb'</tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">tempFile</tt><tt class="py-op">:</tt> </tt>
<a name="L475"></a><tt class="py-lineno"> 475</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">item</tt> <tt class="py-keyword">in</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt><tt class="py-op">.</tt><tt class="py-name">load_stream</tt><tt class="py-op">(</tt><tt class="py-name">tempFile</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L476"></a><tt class="py-lineno"> 476</tt>  <tt class="py-line">                <tt class="py-keyword">yield</tt> <tt class="py-name">item</tt> </tt>
<a name="L477"></a><tt class="py-lineno"> 477</tt>  <tt class="py-line">        <tt class="py-name">os</tt><tt class="py-op">.</tt><tt class="py-name">unlink</tt><tt class="py-op">(</tt><tt class="py-name">tempFile</tt><tt class="py-op">.</tt><tt class="py-name">name</tt><tt class="py-op">)</tt> </tt>
</div><a name="L478"></a><tt class="py-lineno"> 478</tt>  <tt class="py-line"> </tt>
<a name="RDD.reduce"></a><div id="RDD.reduce-def"><a name="L479"></a><tt class="py-lineno"> 479</tt> <a class="py-toggle" href="#" id="RDD.reduce-toggle" onclick="return toggle('RDD.reduce');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#reduce">reduce</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.reduce-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.reduce-expanded"><a name="L480"></a><tt class="py-lineno"> 480</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L481"></a><tt class="py-lineno"> 481</tt>  <tt class="py-line"><tt class="py-docstring">        Reduces the elements of this RDD using the specified commutative and</tt> </tt>
<a name="L482"></a><tt class="py-lineno"> 482</tt>  <tt class="py-line"><tt class="py-docstring">        associative binary operator.</tt> </tt>
<a name="L483"></a><tt class="py-lineno"> 483</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L484"></a><tt class="py-lineno"> 484</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; from operator import add</tt> </tt>
<a name="L485"></a><tt class="py-lineno"> 485</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3, 4, 5]).reduce(add)</tt> </tt>
<a name="L486"></a><tt class="py-lineno"> 486</tt>  <tt class="py-line"><tt class="py-docstring">        15</tt> </tt>
<a name="L487"></a><tt class="py-lineno"> 487</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add)</tt> </tt>
<a name="L488"></a><tt class="py-lineno"> 488</tt>  <tt class="py-line"><tt class="py-docstring">        10</tt> </tt>
<a name="L489"></a><tt class="py-lineno"> 489</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L490"></a><tt class="py-lineno"> 490</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L491"></a><tt class="py-lineno"> 491</tt>  <tt class="py-line">            <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">None</tt> </tt>
<a name="L492"></a><tt class="py-lineno"> 492</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">obj</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L493"></a><tt class="py-lineno"> 493</tt>  <tt class="py-line">                <tt class="py-keyword">if</tt> <tt class="py-name">acc</tt> <tt class="py-keyword">is</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
<a name="L494"></a><tt class="py-lineno"> 494</tt>  <tt class="py-line">                    <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">obj</tt> </tt>
<a name="L495"></a><tt class="py-lineno"> 495</tt>  <tt class="py-line">                <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L496"></a><tt class="py-lineno"> 496</tt>  <tt class="py-line">                    <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">obj</tt><tt class="py-op">,</tt> <tt class="py-name">acc</tt><tt class="py-op">)</tt> </tt>
<a name="L497"></a><tt class="py-lineno"> 497</tt>  <tt class="py-line">            <tt class="py-keyword">if</tt> <tt class="py-name">acc</tt> <tt class="py-keyword">is</tt> <tt class="py-keyword">not</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
<a name="L498"></a><tt class="py-lineno"> 498</tt>  <tt class="py-line">                <tt class="py-keyword">yield</tt> <tt class="py-name">acc</tt> </tt>
</div><a name="L499"></a><tt class="py-lineno"> 499</tt>  <tt class="py-line">        <tt class="py-name">vals</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-70" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-70', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-71" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-71', 'collect', 'link-36');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L500"></a><tt class="py-lineno"> 500</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-72" class="py-name" targets="Method pyspark.rdd.RDD.reduce()=pyspark.rdd.RDD-class.html#reduce"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-72', 'reduce', 'link-72');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">,</tt> <tt class="py-name">vals</tt><tt class="py-op">)</tt> </tt>
</div><a name="L501"></a><tt class="py-lineno"> 501</tt>  <tt class="py-line"> </tt>
<a name="RDD.fold"></a><div id="RDD.fold-def"><a name="L502"></a><tt class="py-lineno"> 502</tt> <a class="py-toggle" href="#" id="RDD.fold-toggle" onclick="return toggle('RDD.fold');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#fold">fold</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">zeroValue</tt><tt class="py-op">,</tt> <tt class="py-param">op</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.fold-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.fold-expanded"><a name="L503"></a><tt class="py-lineno"> 503</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L504"></a><tt class="py-lineno"> 504</tt>  <tt class="py-line"><tt class="py-docstring">        Aggregate the elements of each partition, and then the results for all</tt> </tt>
<a name="L505"></a><tt class="py-lineno"> 505</tt>  <tt class="py-line"><tt class="py-docstring">        the partitions, using a given associative function and a neutral "zero</tt> </tt>
<a name="L506"></a><tt class="py-lineno"> 506</tt>  <tt class="py-line"><tt class="py-docstring">        value."</tt> </tt>
<a name="L507"></a><tt class="py-lineno"> 507</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L508"></a><tt class="py-lineno"> 508</tt>  <tt class="py-line"><tt class="py-docstring">        The function C{op(t1, t2)} is allowed to modify C{t1} and return it</tt> </tt>
<a name="L509"></a><tt class="py-lineno"> 509</tt>  <tt class="py-line"><tt class="py-docstring">        as its result value to avoid object allocation; however, it should not</tt> </tt>
<a name="L510"></a><tt class="py-lineno"> 510</tt>  <tt class="py-line"><tt class="py-docstring">        modify C{t2}.</tt> </tt>
<a name="L511"></a><tt class="py-lineno"> 511</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L512"></a><tt class="py-lineno"> 512</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; from operator import add</tt> </tt>
<a name="L513"></a><tt class="py-lineno"> 513</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)</tt> </tt>
<a name="L514"></a><tt class="py-lineno"> 514</tt>  <tt class="py-line"><tt class="py-docstring">        15</tt> </tt>
<a name="L515"></a><tt class="py-lineno"> 515</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L516"></a><tt class="py-lineno"> 516</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L517"></a><tt class="py-lineno"> 517</tt>  <tt class="py-line">            <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">zeroValue</tt> </tt>
<a name="L518"></a><tt class="py-lineno"> 518</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">obj</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L519"></a><tt class="py-lineno"> 519</tt>  <tt class="py-line">                <tt class="py-name">acc</tt> <tt class="py-op">=</tt> <tt class="py-name">op</tt><tt class="py-op">(</tt><tt class="py-name">obj</tt><tt class="py-op">,</tt> <tt class="py-name">acc</tt><tt class="py-op">)</tt> </tt>
<a name="L520"></a><tt class="py-lineno"> 520</tt>  <tt class="py-line">            <tt class="py-keyword">yield</tt> <tt class="py-name">acc</tt> </tt>
</div><a name="L521"></a><tt class="py-lineno"> 521</tt>  <tt class="py-line">        <tt class="py-name">vals</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-73" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-73', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">func</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-74" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-74', 'collect', 'link-36');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L522"></a><tt class="py-lineno"> 522</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-75" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-75', 'reduce', 'link-72');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">op</tt><tt class="py-op">,</tt> <tt class="py-name">vals</tt><tt class="py-op">,</tt> <tt class="py-name">zeroValue</tt><tt class="py-op">)</tt> </tt>
</div><a name="L523"></a><tt class="py-lineno"> 523</tt>  <tt class="py-line"> </tt>
<a name="L524"></a><tt class="py-lineno"> 524</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: aggregate</tt> </tt>
<a name="L525"></a><tt class="py-lineno"> 525</tt>  <tt class="py-line"> </tt>
<a name="RDD.sum"></a><div id="RDD.sum-def"><a name="L526"></a><tt class="py-lineno"> 526</tt> <a class="py-toggle" href="#" id="RDD.sum-toggle" onclick="return toggle('RDD.sum');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#sum">sum</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.sum-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.sum-expanded"><a name="L527"></a><tt class="py-lineno"> 527</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L528"></a><tt class="py-lineno"> 528</tt>  <tt class="py-line"><tt class="py-docstring">        Add up the elements in this RDD.</tt> </tt>
<a name="L529"></a><tt class="py-lineno"> 529</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L530"></a><tt class="py-lineno"> 530</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1.0, 2.0, 3.0]).sum()</tt> </tt>
<a name="L531"></a><tt class="py-lineno"> 531</tt>  <tt class="py-line"><tt class="py-docstring">        6.0</tt> </tt>
<a name="L532"></a><tt class="py-lineno"> 532</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L533"></a><tt class="py-lineno"> 533</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-76" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-76', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">[</tt><tt id="link-77" class="py-name" targets="Method pyspark.rdd.RDD.sum()=pyspark.rdd.RDD-class.html#sum,Method pyspark.statcounter.StatCounter.sum()=pyspark.statcounter.StatCounter-class.html#sum"><a title="pyspark.rdd.RDD.sum
pyspark.statcounter.StatCounter.sum" class="py-name" href="#" onclick="return doclink('link-77', 'sum', 'link-77');">sum</a></tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-78" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-78', 'reduce', 'link-72');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">operator</tt><tt class="py-op">.</tt><tt id="link-79" class="py-name" targets="Method pyspark.accumulators.Accumulator.add()=pyspark.accumulators.Accumulator-class.html#add"><a title="pyspark.accumulators.Accumulator.add" class="py-name" href="#" onclick="return doclink('link-79', 'add', 'link-79');">add</a></tt><tt class="py-op">)</tt> </tt>
</div><a name="L534"></a><tt class="py-lineno"> 534</tt>  <tt class="py-line"> </tt>
<a name="RDD.count"></a><div id="RDD.count-def"><a name="L535"></a><tt class="py-lineno"> 535</tt> <a class="py-toggle" href="#" id="RDD.count-toggle" onclick="return toggle('RDD.count');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#count">count</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.count-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.count-expanded"><a name="L536"></a><tt class="py-lineno"> 536</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L537"></a><tt class="py-lineno"> 537</tt>  <tt class="py-line"><tt class="py-docstring">        Return the number of elements in this RDD.</tt> </tt>
<a name="L538"></a><tt class="py-lineno"> 538</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L539"></a><tt class="py-lineno"> 539</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([2, 3, 4]).count()</tt> </tt>
<a name="L540"></a><tt class="py-lineno"> 540</tt>  <tt class="py-line"><tt class="py-docstring">        3</tt> </tt>
<a name="L541"></a><tt class="py-lineno"> 541</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L542"></a><tt class="py-lineno"> 542</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-80" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-80', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">i</tt><tt class="py-op">:</tt> <tt class="py-op">[</tt><tt id="link-81" class="py-name"><a title="pyspark.rdd.RDD.sum
pyspark.statcounter.StatCounter.sum" class="py-name" href="#" onclick="return doclink('link-81', 'sum', 'link-77');">sum</a></tt><tt class="py-op">(</tt><tt class="py-number">1</tt> <tt class="py-keyword">for</tt> <tt class="py-name">_</tt> <tt class="py-keyword">in</tt> <tt class="py-name">i</tt><tt class="py-op">)</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-82" class="py-name"><a title="pyspark.rdd.RDD.sum
pyspark.statcounter.StatCounter.sum" class="py-name" href="#" onclick="return doclink('link-82', 'sum', 'link-77');">sum</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L543"></a><tt class="py-lineno"> 543</tt>  <tt class="py-line"> </tt>
<a name="RDD.stats"></a><div id="RDD.stats-def"><a name="L544"></a><tt class="py-lineno"> 544</tt> <a class="py-toggle" href="#" id="RDD.stats-toggle" onclick="return toggle('RDD.stats');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#stats">stats</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.stats-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.stats-expanded"><a name="L545"></a><tt class="py-lineno"> 545</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L546"></a><tt class="py-lineno"> 546</tt>  <tt class="py-line"><tt class="py-docstring">        Return a L{StatCounter} object that captures the mean, variance</tt> </tt>
<a name="L547"></a><tt class="py-lineno"> 547</tt>  <tt class="py-line"><tt class="py-docstring">        and count of the RDD's elements in one operation.</tt> </tt>
<a name="L548"></a><tt class="py-lineno"> 548</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L549"></a><tt class="py-lineno"> 549</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">redFunc</tt><tt class="py-op">(</tt><tt class="py-param">left_counter</tt><tt class="py-op">,</tt> <tt class="py-param">right_counter</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L550"></a><tt class="py-lineno"> 550</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">left_counter</tt><tt class="py-op">.</tt><tt id="link-83" class="py-name" targets="Method pyspark.statcounter.StatCounter.mergeStats()=pyspark.statcounter.StatCounter-class.html#mergeStats"><a title="pyspark.statcounter.StatCounter.mergeStats" class="py-name" href="#" onclick="return doclink('link-83', 'mergeStats', 'link-83');">mergeStats</a></tt><tt class="py-op">(</tt><tt class="py-name">right_counter</tt><tt class="py-op">)</tt> </tt>
</div><a name="L551"></a><tt class="py-lineno"> 551</tt>  <tt class="py-line"> </tt>
<a name="L552"></a><tt class="py-lineno"> 552</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-84" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-84', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">i</tt><tt class="py-op">:</tt> <tt class="py-op">[</tt><tt id="link-85" class="py-name"><a title="pyspark.statcounter.StatCounter" class="py-name" href="#" onclick="return doclink('link-85', 'StatCounter', 'link-7');">StatCounter</a></tt><tt class="py-op">(</tt><tt class="py-name">i</tt><tt class="py-op">)</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-86" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-86', 'reduce', 'link-72');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">redFunc</tt><tt class="py-op">)</tt> </tt>
</div><a name="L553"></a><tt class="py-lineno"> 553</tt>  <tt class="py-line"> </tt>
<a name="RDD.mean"></a><div id="RDD.mean-def"><a name="L554"></a><tt class="py-lineno"> 554</tt> <a class="py-toggle" href="#" id="RDD.mean-toggle" onclick="return toggle('RDD.mean');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#mean">mean</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.mean-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.mean-expanded"><a name="L555"></a><tt class="py-lineno"> 555</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L556"></a><tt class="py-lineno"> 556</tt>  <tt class="py-line"><tt class="py-docstring">        Compute the mean of this RDD's elements.</tt> </tt>
<a name="L557"></a><tt class="py-lineno"> 557</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L558"></a><tt class="py-lineno"> 558</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3]).mean()</tt> </tt>
<a name="L559"></a><tt class="py-lineno"> 559</tt>  <tt class="py-line"><tt class="py-docstring">        2.0</tt> </tt>
<a name="L560"></a><tt class="py-lineno"> 560</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L561"></a><tt class="py-lineno"> 561</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-87" class="py-name" targets="Method pyspark.rdd.RDD.stats()=pyspark.rdd.RDD-class.html#stats"><a title="pyspark.rdd.RDD.stats" class="py-name" href="#" onclick="return doclink('link-87', 'stats', 'link-87');">stats</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-88" class="py-name" targets="Method pyspark.rdd.RDD.mean()=pyspark.rdd.RDD-class.html#mean,Method pyspark.statcounter.StatCounter.mean()=pyspark.statcounter.StatCounter-class.html#mean"><a title="pyspark.rdd.RDD.mean
pyspark.statcounter.StatCounter.mean" class="py-name" href="#" onclick="return doclink('link-88', 'mean', 'link-88');">mean</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L562"></a><tt class="py-lineno"> 562</tt>  <tt class="py-line"> </tt>
<a name="RDD.variance"></a><div id="RDD.variance-def"><a name="L563"></a><tt class="py-lineno"> 563</tt> <a class="py-toggle" href="#" id="RDD.variance-toggle" onclick="return toggle('RDD.variance');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#variance">variance</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.variance-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.variance-expanded"><a name="L564"></a><tt class="py-lineno"> 564</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L565"></a><tt class="py-lineno"> 565</tt>  <tt class="py-line"><tt class="py-docstring">        Compute the variance of this RDD's elements.</tt> </tt>
<a name="L566"></a><tt class="py-lineno"> 566</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L567"></a><tt class="py-lineno"> 567</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3]).variance()</tt> </tt>
<a name="L568"></a><tt class="py-lineno"> 568</tt>  <tt class="py-line"><tt class="py-docstring">        0.666...</tt> </tt>
<a name="L569"></a><tt class="py-lineno"> 569</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L570"></a><tt class="py-lineno"> 570</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-89" class="py-name"><a title="pyspark.rdd.RDD.stats" class="py-name" href="#" onclick="return doclink('link-89', 'stats', 'link-87');">stats</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-90" class="py-name" targets="Method pyspark.rdd.RDD.variance()=pyspark.rdd.RDD-class.html#variance,Method pyspark.statcounter.StatCounter.variance()=pyspark.statcounter.StatCounter-class.html#variance"><a title="pyspark.rdd.RDD.variance
pyspark.statcounter.StatCounter.variance" class="py-name" href="#" onclick="return doclink('link-90', 'variance', 'link-90');">variance</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L571"></a><tt class="py-lineno"> 571</tt>  <tt class="py-line"> </tt>
<a name="RDD.stdev"></a><div id="RDD.stdev-def"><a name="L572"></a><tt class="py-lineno"> 572</tt> <a class="py-toggle" href="#" id="RDD.stdev-toggle" onclick="return toggle('RDD.stdev');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#stdev">stdev</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.stdev-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.stdev-expanded"><a name="L573"></a><tt class="py-lineno"> 573</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L574"></a><tt class="py-lineno"> 574</tt>  <tt class="py-line"><tt class="py-docstring">        Compute the standard deviation of this RDD's elements.</tt> </tt>
<a name="L575"></a><tt class="py-lineno"> 575</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L576"></a><tt class="py-lineno"> 576</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3]).stdev()</tt> </tt>
<a name="L577"></a><tt class="py-lineno"> 577</tt>  <tt class="py-line"><tt class="py-docstring">        0.816...</tt> </tt>
<a name="L578"></a><tt class="py-lineno"> 578</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L579"></a><tt class="py-lineno"> 579</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-91" class="py-name"><a title="pyspark.rdd.RDD.stats" class="py-name" href="#" onclick="return doclink('link-91', 'stats', 'link-87');">stats</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-92" class="py-name" targets="Method pyspark.rdd.RDD.stdev()=pyspark.rdd.RDD-class.html#stdev,Method pyspark.statcounter.StatCounter.stdev()=pyspark.statcounter.StatCounter-class.html#stdev"><a title="pyspark.rdd.RDD.stdev
pyspark.statcounter.StatCounter.stdev" class="py-name" href="#" onclick="return doclink('link-92', 'stdev', 'link-92');">stdev</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L580"></a><tt class="py-lineno"> 580</tt>  <tt class="py-line"> </tt>
<a name="RDD.sampleStdev"></a><div id="RDD.sampleStdev-def"><a name="L581"></a><tt class="py-lineno"> 581</tt> <a class="py-toggle" href="#" id="RDD.sampleStdev-toggle" onclick="return toggle('RDD.sampleStdev');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#sampleStdev">sampleStdev</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.sampleStdev-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.sampleStdev-expanded"><a name="L582"></a><tt class="py-lineno"> 582</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L583"></a><tt class="py-lineno"> 583</tt>  <tt class="py-line"><tt class="py-docstring">        Compute the sample standard deviation of this RDD's elements (which corrects for bias in</tt> </tt>
<a name="L584"></a><tt class="py-lineno"> 584</tt>  <tt class="py-line"><tt class="py-docstring">        estimating the standard deviation by dividing by N-1 instead of N).</tt> </tt>
<a name="L585"></a><tt class="py-lineno"> 585</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L586"></a><tt class="py-lineno"> 586</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3]).sampleStdev()</tt> </tt>
<a name="L587"></a><tt class="py-lineno"> 587</tt>  <tt class="py-line"><tt class="py-docstring">        1.0</tt> </tt>
<a name="L588"></a><tt class="py-lineno"> 588</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L589"></a><tt class="py-lineno"> 589</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-93" class="py-name"><a title="pyspark.rdd.RDD.stats" class="py-name" href="#" onclick="return doclink('link-93', 'stats', 'link-87');">stats</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-94" class="py-name" targets="Method pyspark.rdd.RDD.sampleStdev()=pyspark.rdd.RDD-class.html#sampleStdev,Method pyspark.statcounter.StatCounter.sampleStdev()=pyspark.statcounter.StatCounter-class.html#sampleStdev"><a title="pyspark.rdd.RDD.sampleStdev
pyspark.statcounter.StatCounter.sampleStdev" class="py-name" href="#" onclick="return doclink('link-94', 'sampleStdev', 'link-94');">sampleStdev</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L590"></a><tt class="py-lineno"> 590</tt>  <tt class="py-line"> </tt>
<a name="RDD.sampleVariance"></a><div id="RDD.sampleVariance-def"><a name="L591"></a><tt class="py-lineno"> 591</tt> <a class="py-toggle" href="#" id="RDD.sampleVariance-toggle" onclick="return toggle('RDD.sampleVariance');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#sampleVariance">sampleVariance</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.sampleVariance-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.sampleVariance-expanded"><a name="L592"></a><tt class="py-lineno"> 592</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L593"></a><tt class="py-lineno"> 593</tt>  <tt class="py-line"><tt class="py-docstring">        Compute the sample variance of this RDD's elements (which corrects for bias in</tt> </tt>
<a name="L594"></a><tt class="py-lineno"> 594</tt>  <tt class="py-line"><tt class="py-docstring">        estimating the variance by dividing by N-1 instead of N).</tt> </tt>
<a name="L595"></a><tt class="py-lineno"> 595</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L596"></a><tt class="py-lineno"> 596</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([1, 2, 3]).sampleVariance()</tt> </tt>
<a name="L597"></a><tt class="py-lineno"> 597</tt>  <tt class="py-line"><tt class="py-docstring">        1.0</tt> </tt>
<a name="L598"></a><tt class="py-lineno"> 598</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L599"></a><tt class="py-lineno"> 599</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-95" class="py-name"><a title="pyspark.rdd.RDD.stats" class="py-name" href="#" onclick="return doclink('link-95', 'stats', 'link-87');">stats</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-96" class="py-name" targets="Method pyspark.rdd.RDD.sampleVariance()=pyspark.rdd.RDD-class.html#sampleVariance,Method pyspark.statcounter.StatCounter.sampleVariance()=pyspark.statcounter.StatCounter-class.html#sampleVariance"><a title="pyspark.rdd.RDD.sampleVariance
pyspark.statcounter.StatCounter.sampleVariance" class="py-name" href="#" onclick="return doclink('link-96', 'sampleVariance', 'link-96');">sampleVariance</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L600"></a><tt class="py-lineno"> 600</tt>  <tt class="py-line"> </tt>
<a name="RDD.countByValue"></a><div id="RDD.countByValue-def"><a name="L601"></a><tt class="py-lineno"> 601</tt> <a class="py-toggle" href="#" id="RDD.countByValue-toggle" onclick="return toggle('RDD.countByValue');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#countByValue">countByValue</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.countByValue-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.countByValue-expanded"><a name="L602"></a><tt class="py-lineno"> 602</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L603"></a><tt class="py-lineno"> 603</tt>  <tt class="py-line"><tt class="py-docstring">        Return the count of each unique value in this RDD as a dictionary of</tt> </tt>
<a name="L604"></a><tt class="py-lineno"> 604</tt>  <tt class="py-line"><tt class="py-docstring">        (value, count) pairs.</tt> </tt>
<a name="L605"></a><tt class="py-lineno"> 605</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L606"></a><tt class="py-lineno"> 606</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())</tt> </tt>
<a name="L607"></a><tt class="py-lineno"> 607</tt>  <tt class="py-line"><tt class="py-docstring">        [(1, 2), (2, 3)]</tt> </tt>
<a name="L608"></a><tt class="py-lineno"> 608</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L609"></a><tt class="py-lineno"> 609</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">countPartition</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L610"></a><tt class="py-lineno"> 610</tt>  <tt class="py-line">            <tt class="py-name">counts</tt> <tt class="py-op">=</tt> <tt class="py-name">defaultdict</tt><tt class="py-op">(</tt><tt class="py-name">int</tt><tt class="py-op">)</tt> </tt>
<a name="L611"></a><tt class="py-lineno"> 611</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">obj</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L612"></a><tt class="py-lineno"> 612</tt>  <tt class="py-line">                <tt class="py-name">counts</tt><tt class="py-op">[</tt><tt class="py-name">obj</tt><tt class="py-op">]</tt> <tt class="py-op">+=</tt> <tt class="py-number">1</tt> </tt>
<a name="L613"></a><tt class="py-lineno"> 613</tt>  <tt class="py-line">            <tt class="py-keyword">yield</tt> <tt class="py-name">counts</tt> </tt>
<a name="L614"></a><tt class="py-lineno"> 614</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">mergeMaps</tt><tt class="py-op">(</tt><tt class="py-param">m1</tt><tt class="py-op">,</tt> <tt class="py-param">m2</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L615"></a><tt class="py-lineno"> 615</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">m2</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L616"></a><tt class="py-lineno"> 616</tt>  <tt class="py-line">                <tt class="py-name">m1</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">+=</tt> <tt class="py-name">v</tt> </tt>
<a name="L617"></a><tt class="py-lineno"> 617</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">m1</tt> </tt>
<a name="L618"></a><tt class="py-lineno"> 618</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-97" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-97', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">countPartition</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-98" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-98', 'reduce', 'link-72');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">mergeMaps</tt><tt class="py-op">)</tt> </tt>
</div><a name="L619"></a><tt class="py-lineno"> 619</tt>  <tt class="py-line"> </tt>
<a name="RDD.take"></a><div id="RDD.take-def"><a name="L620"></a><tt class="py-lineno"> 620</tt> <a class="py-toggle" href="#" id="RDD.take-toggle" onclick="return toggle('RDD.take');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#take">take</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">num</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.take-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.take-expanded"><a name="L621"></a><tt class="py-lineno"> 621</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L622"></a><tt class="py-lineno"> 622</tt>  <tt class="py-line"><tt class="py-docstring">        Take the first num elements of the RDD.</tt> </tt>
<a name="L623"></a><tt class="py-lineno"> 623</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L624"></a><tt class="py-lineno"> 624</tt>  <tt class="py-line"><tt class="py-docstring">        This currently scans the partitions *one by one*, so it will be slow if</tt> </tt>
<a name="L625"></a><tt class="py-lineno"> 625</tt>  <tt class="py-line"><tt class="py-docstring">        a lot of partitions are required. In that case, use L{collect} to get</tt> </tt>
<a name="L626"></a><tt class="py-lineno"> 626</tt>  <tt class="py-line"><tt class="py-docstring">        the whole RDD instead.</tt> </tt>
<a name="L627"></a><tt class="py-lineno"> 627</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L628"></a><tt class="py-lineno"> 628</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)</tt> </tt>
<a name="L629"></a><tt class="py-lineno"> 629</tt>  <tt class="py-line"><tt class="py-docstring">        [2, 3]</tt> </tt>
<a name="L630"></a><tt class="py-lineno"> 630</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([2, 3, 4, 5, 6]).take(10)</tt> </tt>
<a name="L631"></a><tt class="py-lineno"> 631</tt>  <tt class="py-line"><tt class="py-docstring">        [2, 3, 4, 5, 6]</tt> </tt>
<a name="L632"></a><tt class="py-lineno"> 632</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L633"></a><tt class="py-lineno"> 633</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">takeUpToNum</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L634"></a><tt class="py-lineno"> 634</tt>  <tt class="py-line">            <tt class="py-name">taken</tt> <tt class="py-op">=</tt> <tt class="py-number">0</tt> </tt>
<a name="L635"></a><tt class="py-lineno"> 635</tt>  <tt class="py-line">            <tt class="py-keyword">while</tt> <tt class="py-name">taken</tt> <tt class="py-op">&lt;</tt> <tt class="py-name">num</tt><tt class="py-op">:</tt> </tt>
<a name="L636"></a><tt class="py-lineno"> 636</tt>  <tt class="py-line">                <tt class="py-keyword">yield</tt> <tt class="py-name">next</tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">)</tt> </tt>
<a name="L637"></a><tt class="py-lineno"> 637</tt>  <tt class="py-line">                <tt class="py-name">taken</tt> <tt class="py-op">+=</tt> <tt class="py-number">1</tt> </tt>
<a name="L638"></a><tt class="py-lineno"> 638</tt>  <tt class="py-line">        <tt class="py-comment"># Take only up to num elements from each partition we try</tt> </tt>
<a name="L639"></a><tt class="py-lineno"> 639</tt>  <tt class="py-line">        <tt class="py-name">mapped</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-99" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-99', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">takeUpToNum</tt><tt class="py-op">)</tt> </tt>
<a name="L640"></a><tt class="py-lineno"> 640</tt>  <tt class="py-line">        <tt class="py-name">items</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-op">]</tt> </tt>
<a name="L641"></a><tt class="py-lineno"> 641</tt>  <tt class="py-line">        <tt class="py-comment"># TODO(shivaram): Similar to the scala implementation, update the take </tt> </tt>
<a name="L642"></a><tt class="py-lineno"> 642</tt>  <tt class="py-line">        <tt class="py-comment"># method to scan multiple splits based on an estimate of how many elements </tt> </tt>
<a name="L643"></a><tt class="py-lineno"> 643</tt>  <tt class="py-line">        <tt class="py-comment"># we have per-split.</tt> </tt>
<a name="L644"></a><tt class="py-lineno"> 644</tt>  <tt class="py-line">        <tt class="py-keyword">with</tt> <tt class="py-name">_JavaStackTrace</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-100" class="py-name"><a title="pyspark.context
pyspark.rdd.RDD.context" class="py-name" href="#" onclick="return doclink('link-100', 'context', 'link-67');">context</a></tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">st</tt><tt class="py-op">:</tt> </tt>
<a name="L645"></a><tt class="py-lineno"> 645</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">partition</tt> <tt class="py-keyword">in</tt> <tt class="py-name">range</tt><tt class="py-op">(</tt><tt class="py-name">mapped</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt class="py-name">splits</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">size</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L646"></a><tt class="py-lineno"> 646</tt>  <tt class="py-line">                <tt class="py-name">partitionsToTake</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-101" class="py-name" targets="Variable pyspark.context.SparkContext._gateway=pyspark.context.SparkContext-class.html#_gateway"><a title="pyspark.context.SparkContext._gateway" class="py-name" href="#" onclick="return doclink('link-101', '_gateway', 'link-101');">_gateway</a></tt><tt class="py-op">.</tt><tt class="py-name">new_array</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-102" class="py-name" targets="Variable pyspark.context.SparkContext._jvm=pyspark.context.SparkContext-class.html#_jvm"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-102', '_jvm', 'link-102');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">int</tt><tt class="py-op">,</tt> <tt class="py-number">1</tt><tt class="py-op">)</tt> </tt>
<a name="L647"></a><tt class="py-lineno"> 647</tt>  <tt class="py-line">                <tt class="py-name">partitionsToTake</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">partition</tt> </tt>
<a name="L648"></a><tt class="py-lineno"> 648</tt>  <tt class="py-line">                <tt class="py-name">iterator</tt> <tt class="py-op">=</tt> <tt class="py-name">mapped</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt class="py-name">collectPartitions</tt><tt class="py-op">(</tt><tt class="py-name">partitionsToTake</tt><tt class="py-op">)</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">iterator</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L649"></a><tt class="py-lineno"> 649</tt>  <tt class="py-line">                <tt class="py-name">items</tt><tt class="py-op">.</tt><tt class="py-name">extend</tt><tt class="py-op">(</tt><tt class="py-name">mapped</tt><tt class="py-op">.</tt><tt class="py-name">_collect_iterator_through_file</tt><tt class="py-op">(</tt><tt class="py-name">iterator</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L650"></a><tt class="py-lineno"> 650</tt>  <tt class="py-line">                <tt class="py-keyword">if</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">items</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;=</tt> <tt class="py-name">num</tt><tt class="py-op">:</tt> </tt>
<a name="L651"></a><tt class="py-lineno"> 651</tt>  <tt class="py-line">                    <tt class="py-keyword">break</tt> </tt>
<a name="L652"></a><tt class="py-lineno"> 652</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">items</tt><tt class="py-op">[</tt><tt class="py-op">:</tt><tt class="py-name">num</tt><tt class="py-op">]</tt> </tt>
</div><a name="L653"></a><tt class="py-lineno"> 653</tt>  <tt class="py-line"> </tt>
<a name="RDD.first"></a><div id="RDD.first-def"><a name="L654"></a><tt class="py-lineno"> 654</tt> <a class="py-toggle" href="#" id="RDD.first-toggle" onclick="return toggle('RDD.first');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#first">first</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.first-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.first-expanded"><a name="L655"></a><tt class="py-lineno"> 655</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L656"></a><tt class="py-lineno"> 656</tt>  <tt class="py-line"><tt class="py-docstring">        Return the first element in this RDD.</tt> </tt>
<a name="L657"></a><tt class="py-lineno"> 657</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L658"></a><tt class="py-lineno"> 658</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize([2, 3, 4]).first()</tt> </tt>
<a name="L659"></a><tt class="py-lineno"> 659</tt>  <tt class="py-line"><tt class="py-docstring">        2</tt> </tt>
<a name="L660"></a><tt class="py-lineno"> 660</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L661"></a><tt class="py-lineno"> 661</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-103" class="py-name" targets="Method pyspark.rdd.RDD.take()=pyspark.rdd.RDD-class.html#take"><a title="pyspark.rdd.RDD.take" class="py-name" href="#" onclick="return doclink('link-103', 'take', 'link-103');">take</a></tt><tt class="py-op">(</tt><tt class="py-number">1</tt><tt class="py-op">)</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt> </tt>
</div><a name="L662"></a><tt class="py-lineno"> 662</tt>  <tt class="py-line"> </tt>
<a name="RDD.saveAsTextFile"></a><div id="RDD.saveAsTextFile-def"><a name="L663"></a><tt class="py-lineno"> 663</tt> <a class="py-toggle" href="#" id="RDD.saveAsTextFile-toggle" onclick="return toggle('RDD.saveAsTextFile');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#saveAsTextFile">saveAsTextFile</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">path</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.saveAsTextFile-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.saveAsTextFile-expanded"><a name="L664"></a><tt class="py-lineno"> 664</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L665"></a><tt class="py-lineno"> 665</tt>  <tt class="py-line"><tt class="py-docstring">        Save this RDD as a text file, using string representations of elements.</tt> </tt>
<a name="L666"></a><tt class="py-lineno"> 666</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L667"></a><tt class="py-lineno"> 667</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; tempFile = NamedTemporaryFile(delete=True)</tt> </tt>
<a name="L668"></a><tt class="py-lineno"> 668</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; tempFile.close()</tt> </tt>
<a name="L669"></a><tt class="py-lineno"> 669</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sc.parallelize(range(10)).saveAsTextFile(tempFile.name)</tt> </tt>
<a name="L670"></a><tt class="py-lineno"> 670</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; from fileinput import input</tt> </tt>
<a name="L671"></a><tt class="py-lineno"> 671</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; from glob import glob</tt> </tt>
<a name="L672"></a><tt class="py-lineno"> 672</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))</tt> </tt>
<a name="L673"></a><tt class="py-lineno"> 673</tt>  <tt class="py-line"><tt class="py-docstring">        '0\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n'</tt> </tt>
<a name="L674"></a><tt class="py-lineno"> 674</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L675"></a><tt class="py-lineno"> 675</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L676"></a><tt class="py-lineno"> 676</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L677"></a><tt class="py-lineno"> 677</tt>  <tt class="py-line">                <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">isinstance</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">basestring</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L678"></a><tt class="py-lineno"> 678</tt>  <tt class="py-line">                    <tt class="py-name">x</tt> <tt class="py-op">=</tt> <tt class="py-name">unicode</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt> </tt>
<a name="L679"></a><tt class="py-lineno"> 679</tt>  <tt class="py-line">                <tt class="py-keyword">yield</tt> <tt class="py-name">x</tt><tt class="py-op">.</tt><tt class="py-name">encode</tt><tt class="py-op">(</tt><tt class="py-string">"utf-8"</tt><tt class="py-op">)</tt> </tt>
<a name="L680"></a><tt class="py-lineno"> 680</tt>  <tt class="py-line">        <tt class="py-name">keyed</tt> <tt class="py-op">=</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
<a name="L681"></a><tt class="py-lineno"> 681</tt>  <tt class="py-line">        <tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L682"></a><tt class="py-lineno"> 682</tt>  <tt class="py-line">        <tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-104" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-104', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-105" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-105', '_jvm', 'link-102');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">BytesToString</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-106" class="py-name" targets="Method pyspark.rdd.RDD.saveAsTextFile()=pyspark.rdd.RDD-class.html#saveAsTextFile"><a title="pyspark.rdd.RDD.saveAsTextFile" class="py-name" href="#" onclick="return doclink('link-106', 'saveAsTextFile', 'link-106');">saveAsTextFile</a></tt><tt class="py-op">(</tt><tt class="py-name">path</tt><tt class="py-op">)</tt> </tt>
</div><a name="L683"></a><tt class="py-lineno"> 683</tt>  <tt class="py-line"> </tt>
<a name="L684"></a><tt class="py-lineno"> 684</tt>  <tt class="py-line">    <tt class="py-comment"># Pair functions</tt> </tt>
<a name="L685"></a><tt class="py-lineno"> 685</tt>  <tt class="py-line"> </tt>
<a name="RDD.collectAsMap"></a><div id="RDD.collectAsMap-def"><a name="L686"></a><tt class="py-lineno"> 686</tt> <a class="py-toggle" href="#" id="RDD.collectAsMap-toggle" onclick="return toggle('RDD.collectAsMap');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#collectAsMap">collectAsMap</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.collectAsMap-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.collectAsMap-expanded"><a name="L687"></a><tt class="py-lineno"> 687</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L688"></a><tt class="py-lineno"> 688</tt>  <tt class="py-line"><tt class="py-docstring">        Return the key-value pairs in this RDD to the master as a dictionary.</tt> </tt>
<a name="L689"></a><tt class="py-lineno"> 689</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L690"></a><tt class="py-lineno"> 690</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()</tt> </tt>
<a name="L691"></a><tt class="py-lineno"> 691</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; m[1]</tt> </tt>
<a name="L692"></a><tt class="py-lineno"> 692</tt>  <tt class="py-line"><tt class="py-docstring">        2</tt> </tt>
<a name="L693"></a><tt class="py-lineno"> 693</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; m[3]</tt> </tt>
<a name="L694"></a><tt class="py-lineno"> 694</tt>  <tt class="py-line"><tt class="py-docstring">        4</tt> </tt>
<a name="L695"></a><tt class="py-lineno"> 695</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L696"></a><tt class="py-lineno"> 696</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">dict</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-107" class="py-name"><a title="pyspark.rdd.RDD.collect" class="py-name" href="#" onclick="return doclink('link-107', 'collect', 'link-36');">collect</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div><a name="L697"></a><tt class="py-lineno"> 697</tt>  <tt class="py-line"> </tt>
<a name="RDD.reduceByKey"></a><div id="RDD.reduceByKey-def"><a name="L698"></a><tt class="py-lineno"> 698</tt> <a class="py-toggle" href="#" id="RDD.reduceByKey-toggle" onclick="return toggle('RDD.reduceByKey');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#reduceByKey">reduceByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">func</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.reduceByKey-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.reduceByKey-expanded"><a name="L699"></a><tt class="py-lineno"> 699</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L700"></a><tt class="py-lineno"> 700</tt>  <tt class="py-line"><tt class="py-docstring">        Merge the values for each key using an associative reduce function.</tt> </tt>
<a name="L701"></a><tt class="py-lineno"> 701</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L702"></a><tt class="py-lineno"> 702</tt>  <tt class="py-line"><tt class="py-docstring">        This will also perform the merging locally on each mapper before</tt> </tt>
<a name="L703"></a><tt class="py-lineno"> 703</tt>  <tt class="py-line"><tt class="py-docstring">        sending results to a reducer, similarly to a "combiner" in MapReduce.</tt> </tt>
<a name="L704"></a><tt class="py-lineno"> 704</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L705"></a><tt class="py-lineno"> 705</tt>  <tt class="py-line"><tt class="py-docstring">        Output will be hash-partitioned with C{numPartitions} partitions, or</tt> </tt>
<a name="L706"></a><tt class="py-lineno"> 706</tt>  <tt class="py-line"><tt class="py-docstring">        the default parallelism level if C{numPartitions} is not specified.</tt> </tt>
<a name="L707"></a><tt class="py-lineno"> 707</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L708"></a><tt class="py-lineno"> 708</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; from operator import add</tt> </tt>
<a name="L709"></a><tt class="py-lineno"> 709</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
<a name="L710"></a><tt class="py-lineno"> 710</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(rdd.reduceByKey(add).collect())</tt> </tt>
<a name="L711"></a><tt class="py-lineno"> 711</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', 2), ('b', 1)]</tt> </tt>
<a name="L712"></a><tt class="py-lineno"> 712</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L713"></a><tt class="py-lineno"> 713</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-108" class="py-name" targets="Method pyspark.rdd.RDD.combineByKey()=pyspark.rdd.RDD-class.html#combineByKey"><a title="pyspark.rdd.RDD.combineByKey" class="py-name" href="#" onclick="return doclink('link-108', 'combineByKey', 'link-108');">combineByKey</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
</div><a name="L714"></a><tt class="py-lineno"> 714</tt>  <tt class="py-line"> </tt>
<a name="RDD.reduceByKeyLocally"></a><div id="RDD.reduceByKeyLocally-def"><a name="L715"></a><tt class="py-lineno"> 715</tt> <a class="py-toggle" href="#" id="RDD.reduceByKeyLocally-toggle" onclick="return toggle('RDD.reduceByKeyLocally');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#reduceByKeyLocally">reduceByKeyLocally</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">func</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.reduceByKeyLocally-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.reduceByKeyLocally-expanded"><a name="L716"></a><tt class="py-lineno"> 716</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L717"></a><tt class="py-lineno"> 717</tt>  <tt class="py-line"><tt class="py-docstring">        Merge the values for each key using an associative reduce function, but</tt> </tt>
<a name="L718"></a><tt class="py-lineno"> 718</tt>  <tt class="py-line"><tt class="py-docstring">        return the results immediately to the master as a dictionary.</tt> </tt>
<a name="L719"></a><tt class="py-lineno"> 719</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L720"></a><tt class="py-lineno"> 720</tt>  <tt class="py-line"><tt class="py-docstring">        This will also perform the merging locally on each mapper before</tt> </tt>
<a name="L721"></a><tt class="py-lineno"> 721</tt>  <tt class="py-line"><tt class="py-docstring">        sending results to a reducer, similarly to a "combiner" in MapReduce.</tt> </tt>
<a name="L722"></a><tt class="py-lineno"> 722</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L723"></a><tt class="py-lineno"> 723</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; from operator import add</tt> </tt>
<a name="L724"></a><tt class="py-lineno"> 724</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
<a name="L725"></a><tt class="py-lineno"> 725</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(rdd.reduceByKeyLocally(add).items())</tt> </tt>
<a name="L726"></a><tt class="py-lineno"> 726</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', 2), ('b', 1)]</tt> </tt>
<a name="L727"></a><tt class="py-lineno"> 727</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L728"></a><tt class="py-lineno"> 728</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">reducePartition</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L729"></a><tt class="py-lineno"> 729</tt>  <tt class="py-line">            <tt class="py-name">m</tt> <tt class="py-op">=</tt> <tt class="py-op">{</tt><tt class="py-op">}</tt> </tt>
<a name="L730"></a><tt class="py-lineno"> 730</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L731"></a><tt class="py-lineno"> 731</tt>  <tt class="py-line">                <tt class="py-name">m</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">v</tt> <tt class="py-keyword">if</tt> <tt class="py-name">k</tt> <tt class="py-keyword">not</tt> <tt class="py-keyword">in</tt> <tt class="py-name">m</tt> <tt class="py-keyword">else</tt> <tt class="py-name">func</tt><tt class="py-op">(</tt><tt class="py-name">m</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
<a name="L732"></a><tt class="py-lineno"> 732</tt>  <tt class="py-line">            <tt class="py-keyword">yield</tt> <tt class="py-name">m</tt> </tt>
<a name="L733"></a><tt class="py-lineno"> 733</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">mergeMaps</tt><tt class="py-op">(</tt><tt class="py-param">m1</tt><tt class="py-op">,</tt> <tt class="py-param">m2</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L734"></a><tt class="py-lineno"> 734</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">m2</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L735"></a><tt class="py-lineno"> 735</tt>  <tt class="py-line">                <tt class="py-name">m1</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">v</tt> <tt class="py-keyword">if</tt> <tt class="py-name">k</tt> <tt class="py-keyword">not</tt> <tt class="py-keyword">in</tt> <tt class="py-name">m1</tt> <tt class="py-keyword">else</tt> <tt class="py-name">func</tt><tt class="py-op">(</tt><tt class="py-name">m1</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
<a name="L736"></a><tt class="py-lineno"> 736</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">m1</tt> </tt>
<a name="L737"></a><tt class="py-lineno"> 737</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-109" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-109', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">reducePartition</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-110" class="py-name"><a title="pyspark.rdd.RDD.reduce" class="py-name" href="#" onclick="return doclink('link-110', 'reduce', 'link-72');">reduce</a></tt><tt class="py-op">(</tt><tt class="py-name">mergeMaps</tt><tt class="py-op">)</tt> </tt>
</div><a name="L738"></a><tt class="py-lineno"> 738</tt>  <tt class="py-line"> </tt>
<a name="RDD.countByKey"></a><div id="RDD.countByKey-def"><a name="L739"></a><tt class="py-lineno"> 739</tt> <a class="py-toggle" href="#" id="RDD.countByKey-toggle" onclick="return toggle('RDD.countByKey');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#countByKey">countByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.countByKey-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.countByKey-expanded"><a name="L740"></a><tt class="py-lineno"> 740</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L741"></a><tt class="py-lineno"> 741</tt>  <tt class="py-line"><tt class="py-docstring">        Count the number of elements for each key, and return the result to the</tt> </tt>
<a name="L742"></a><tt class="py-lineno"> 742</tt>  <tt class="py-line"><tt class="py-docstring">        master as a dictionary.</tt> </tt>
<a name="L743"></a><tt class="py-lineno"> 743</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L744"></a><tt class="py-lineno"> 744</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
<a name="L745"></a><tt class="py-lineno"> 745</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(rdd.countByKey().items())</tt> </tt>
<a name="L746"></a><tt class="py-lineno"> 746</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', 2), ('b', 1)]</tt> </tt>
<a name="L747"></a><tt class="py-lineno"> 747</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L748"></a><tt class="py-lineno"> 748</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-111" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-111', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-name">x</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-112" class="py-name" targets="Method pyspark.rdd.RDD.countByValue()=pyspark.rdd.RDD-class.html#countByValue"><a title="pyspark.rdd.RDD.countByValue" class="py-name" href="#" onclick="return doclink('link-112', 'countByValue', 'link-112');">countByValue</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L749"></a><tt class="py-lineno"> 749</tt>  <tt class="py-line"> </tt>
<a name="RDD.join"></a><div id="RDD.join-def"><a name="L750"></a><tt class="py-lineno"> 750</tt> <a class="py-toggle" href="#" id="RDD.join-toggle" onclick="return toggle('RDD.join');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#join">join</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.join-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.join-expanded"><a name="L751"></a><tt class="py-lineno"> 751</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L752"></a><tt class="py-lineno"> 752</tt>  <tt class="py-line"><tt class="py-docstring">        Return an RDD containing all pairs of elements with matching keys in</tt> </tt>
<a name="L753"></a><tt class="py-lineno"> 753</tt>  <tt class="py-line"><tt class="py-docstring">        C{self} and C{other}.</tt> </tt>
<a name="L754"></a><tt class="py-lineno"> 754</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L755"></a><tt class="py-lineno"> 755</tt>  <tt class="py-line"><tt class="py-docstring">        Each pair of elements will be returned as a (k, (v1, v2)) tuple, where</tt> </tt>
<a name="L756"></a><tt class="py-lineno"> 756</tt>  <tt class="py-line"><tt class="py-docstring">        (k, v1) is in C{self} and (k, v2) is in C{other}.</tt> </tt>
<a name="L757"></a><tt class="py-lineno"> 757</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L758"></a><tt class="py-lineno"> 758</tt>  <tt class="py-line"><tt class="py-docstring">        Performs a hash join across the cluster.</tt> </tt>
<a name="L759"></a><tt class="py-lineno"> 759</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L760"></a><tt class="py-lineno"> 760</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4)])</tt> </tt>
<a name="L761"></a><tt class="py-lineno"> 761</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; y = sc.parallelize([("a", 2), ("a", 3)])</tt> </tt>
<a name="L762"></a><tt class="py-lineno"> 762</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(x.join(y).collect())</tt> </tt>
<a name="L763"></a><tt class="py-lineno"> 763</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', (1, 2)), ('a', (1, 3))]</tt> </tt>
<a name="L764"></a><tt class="py-lineno"> 764</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L765"></a><tt class="py-lineno"> 765</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">python_join</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
</div><a name="L766"></a><tt class="py-lineno"> 766</tt>  <tt class="py-line"> </tt>
<a name="RDD.leftOuterJoin"></a><div id="RDD.leftOuterJoin-def"><a name="L767"></a><tt class="py-lineno"> 767</tt> <a class="py-toggle" href="#" id="RDD.leftOuterJoin-toggle" onclick="return toggle('RDD.leftOuterJoin');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#leftOuterJoin">leftOuterJoin</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.leftOuterJoin-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.leftOuterJoin-expanded"><a name="L768"></a><tt class="py-lineno"> 768</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L769"></a><tt class="py-lineno"> 769</tt>  <tt class="py-line"><tt class="py-docstring">        Perform a left outer join of C{self} and C{other}.</tt> </tt>
<a name="L770"></a><tt class="py-lineno"> 770</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L771"></a><tt class="py-lineno"> 771</tt>  <tt class="py-line"><tt class="py-docstring">        For each element (k, v) in C{self}, the resulting RDD will either</tt> </tt>
<a name="L772"></a><tt class="py-lineno"> 772</tt>  <tt class="py-line"><tt class="py-docstring">        contain all pairs (k, (v, w)) for w in C{other}, or the pair</tt> </tt>
<a name="L773"></a><tt class="py-lineno"> 773</tt>  <tt class="py-line"><tt class="py-docstring">        (k, (v, None)) if no elements in other have key k.</tt> </tt>
<a name="L774"></a><tt class="py-lineno"> 774</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L775"></a><tt class="py-lineno"> 775</tt>  <tt class="py-line"><tt class="py-docstring">        Hash-partitions the resulting RDD into the given number of partitions.</tt> </tt>
<a name="L776"></a><tt class="py-lineno"> 776</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L777"></a><tt class="py-lineno"> 777</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4)])</tt> </tt>
<a name="L778"></a><tt class="py-lineno"> 778</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; y = sc.parallelize([("a", 2)])</tt> </tt>
<a name="L779"></a><tt class="py-lineno"> 779</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(x.leftOuterJoin(y).collect())</tt> </tt>
<a name="L780"></a><tt class="py-lineno"> 780</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', (1, 2)), ('b', (4, None))]</tt> </tt>
<a name="L781"></a><tt class="py-lineno"> 781</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L782"></a><tt class="py-lineno"> 782</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">python_left_outer_join</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
</div><a name="L783"></a><tt class="py-lineno"> 783</tt>  <tt class="py-line"> </tt>
<a name="RDD.rightOuterJoin"></a><div id="RDD.rightOuterJoin-def"><a name="L784"></a><tt class="py-lineno"> 784</tt> <a class="py-toggle" href="#" id="RDD.rightOuterJoin-toggle" onclick="return toggle('RDD.rightOuterJoin');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#rightOuterJoin">rightOuterJoin</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.rightOuterJoin-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.rightOuterJoin-expanded"><a name="L785"></a><tt class="py-lineno"> 785</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L786"></a><tt class="py-lineno"> 786</tt>  <tt class="py-line"><tt class="py-docstring">        Perform a right outer join of C{self} and C{other}.</tt> </tt>
<a name="L787"></a><tt class="py-lineno"> 787</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L788"></a><tt class="py-lineno"> 788</tt>  <tt class="py-line"><tt class="py-docstring">        For each element (k, w) in C{other}, the resulting RDD will either</tt> </tt>
<a name="L789"></a><tt class="py-lineno"> 789</tt>  <tt class="py-line"><tt class="py-docstring">        contain all pairs (k, (v, w)) for v in this, or the pair (k, (None, w))</tt> </tt>
<a name="L790"></a><tt class="py-lineno"> 790</tt>  <tt class="py-line"><tt class="py-docstring">        if no elements in C{self} have key k.</tt> </tt>
<a name="L791"></a><tt class="py-lineno"> 791</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L792"></a><tt class="py-lineno"> 792</tt>  <tt class="py-line"><tt class="py-docstring">        Hash-partitions the resulting RDD into the given number of partitions.</tt> </tt>
<a name="L793"></a><tt class="py-lineno"> 793</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L794"></a><tt class="py-lineno"> 794</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4)])</tt> </tt>
<a name="L795"></a><tt class="py-lineno"> 795</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; y = sc.parallelize([("a", 2)])</tt> </tt>
<a name="L796"></a><tt class="py-lineno"> 796</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(y.rightOuterJoin(x).collect())</tt> </tt>
<a name="L797"></a><tt class="py-lineno"> 797</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', (2, 1)), ('b', (None, 4))]</tt> </tt>
<a name="L798"></a><tt class="py-lineno"> 798</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L799"></a><tt class="py-lineno"> 799</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">python_right_outer_join</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
</div><a name="L800"></a><tt class="py-lineno"> 800</tt>  <tt class="py-line"> </tt>
<a name="L801"></a><tt class="py-lineno"> 801</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: add option to control map-side combining</tt> </tt>
<a name="RDD.partitionBy"></a><div id="RDD.partitionBy-def"><a name="L802"></a><tt class="py-lineno"> 802</tt> <a class="py-toggle" href="#" id="RDD.partitionBy-toggle" onclick="return toggle('RDD.partitionBy');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#partitionBy">partitionBy</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">,</tt> <tt class="py-param">partitionFunc</tt><tt class="py-op">=</tt><tt class="py-name">hash</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.partitionBy-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.partitionBy-expanded"><a name="L803"></a><tt class="py-lineno"> 803</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L804"></a><tt class="py-lineno"> 804</tt>  <tt class="py-line"><tt class="py-docstring">        Return a copy of the RDD partitioned using the specified partitioner.</tt> </tt>
<a name="L805"></a><tt class="py-lineno"> 805</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L806"></a><tt class="py-lineno"> 806</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda x: (x, x))</tt> </tt>
<a name="L807"></a><tt class="py-lineno"> 807</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sets = pairs.partitionBy(2).glom().collect()</tt> </tt>
<a name="L808"></a><tt class="py-lineno"> 808</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; set(sets[0]).intersection(set(sets[1]))</tt> </tt>
<a name="L809"></a><tt class="py-lineno"> 809</tt>  <tt class="py-line"><tt class="py-docstring">        set([])</tt> </tt>
<a name="L810"></a><tt class="py-lineno"> 810</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L811"></a><tt class="py-lineno"> 811</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">numPartitions</tt> <tt class="py-keyword">is</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
<a name="L812"></a><tt class="py-lineno"> 812</tt>  <tt class="py-line">            <tt class="py-name">numPartitions</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-113" class="py-name"><a title="pyspark.context.SparkContext.defaultParallelism" class="py-name" href="#" onclick="return doclink('link-113', 'defaultParallelism', 'link-48');">defaultParallelism</a></tt> </tt>
<a name="L813"></a><tt class="py-lineno"> 813</tt>  <tt class="py-line">        <tt class="py-comment"># Transferring O(n) objects to Java is too expensive.  Instead, we'll</tt> </tt>
<a name="L814"></a><tt class="py-lineno"> 814</tt>  <tt class="py-line">        <tt class="py-comment"># form the hash buckets in Python, transferring O(numPartitions) objects</tt> </tt>
<a name="L815"></a><tt class="py-lineno"> 815</tt>  <tt class="py-line">        <tt class="py-comment"># to Java.  Each object is a (splitNumber, [objects]) pair.</tt> </tt>
<a name="L816"></a><tt class="py-lineno"> 816</tt>  <tt class="py-line">        <tt class="py-name">outputSerializer</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_unbatched_serializer</tt> </tt>
<a name="L817"></a><tt class="py-lineno"> 817</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">add_shuffle_key</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L818"></a><tt class="py-lineno"> 818</tt>  <tt class="py-line"> </tt>
<a name="L819"></a><tt class="py-lineno"> 819</tt>  <tt class="py-line">            <tt class="py-name">buckets</tt> <tt class="py-op">=</tt> <tt class="py-name">defaultdict</tt><tt class="py-op">(</tt><tt class="py-name">list</tt><tt class="py-op">)</tt> </tt>
<a name="L820"></a><tt class="py-lineno"> 820</tt>  <tt class="py-line"> </tt>
<a name="L821"></a><tt class="py-lineno"> 821</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L822"></a><tt class="py-lineno"> 822</tt>  <tt class="py-line">                <tt class="py-name">buckets</tt><tt class="py-op">[</tt><tt class="py-name">partitionFunc</tt><tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">)</tt> <tt class="py-op">%</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L823"></a><tt class="py-lineno"> 823</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">,</tt> <tt class="py-name">items</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">buckets</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L824"></a><tt class="py-lineno"> 824</tt>  <tt class="py-line">                <tt class="py-keyword">yield</tt> <tt class="py-name">pack_long</tt><tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">)</tt> </tt>
<a name="L825"></a><tt class="py-lineno"> 825</tt>  <tt class="py-line">                <tt class="py-keyword">yield</tt> <tt class="py-name">outputSerializer</tt><tt class="py-op">.</tt><tt id="link-114" class="py-name" targets="Variable pyspark.serializers.MarshalSerializer.dumps=pyspark.serializers.MarshalSerializer-class.html#dumps,Method pyspark.serializers.PickleSerializer.dumps()=pyspark.serializers.PickleSerializer-class.html#dumps"><a title="pyspark.serializers.MarshalSerializer.dumps
pyspark.serializers.PickleSerializer.dumps" class="py-name" href="#" onclick="return doclink('link-114', 'dumps', 'link-114');">dumps</a></tt><tt class="py-op">(</tt><tt class="py-name">items</tt><tt class="py-op">)</tt> </tt>
<a name="L826"></a><tt class="py-lineno"> 826</tt>  <tt class="py-line">        <tt class="py-name">keyed</tt> <tt class="py-op">=</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">add_shuffle_key</tt><tt class="py-op">)</tt> </tt>
<a name="L827"></a><tt class="py-lineno"> 827</tt>  <tt class="py-line">        <tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L828"></a><tt class="py-lineno"> 828</tt>  <tt class="py-line">        <tt class="py-keyword">with</tt> <tt class="py-name">_JavaStackTrace</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-115" class="py-name"><a title="pyspark.context
pyspark.rdd.RDD.context" class="py-name" href="#" onclick="return doclink('link-115', 'context', 'link-67');">context</a></tt><tt class="py-op">)</tt> <tt class="py-keyword">as</tt> <tt class="py-name">st</tt><tt class="py-op">:</tt> </tt>
<a name="L829"></a><tt class="py-lineno"> 829</tt>  <tt class="py-line">            <tt class="py-name">pairRDD</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-116" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-116', '_jvm', 'link-102');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">PairwiseRDD</tt><tt class="py-op">(</tt><tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-117" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-117', 'rdd', 'link-19');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">asJavaPairRDD</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L830"></a><tt class="py-lineno"> 830</tt>  <tt class="py-line">            <tt class="py-name">partitioner</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-118" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-118', '_jvm', 'link-102');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">PythonPartitioner</tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">,</tt> </tt>
<a name="L831"></a><tt class="py-lineno"> 831</tt>  <tt class="py-line">                                                          <tt class="py-name">id</tt><tt class="py-op">(</tt><tt class="py-name">partitionFunc</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L832"></a><tt class="py-lineno"> 832</tt>  <tt class="py-line">        <tt class="py-name">jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">pairRDD</tt><tt class="py-op">.</tt><tt id="link-119" class="py-name"><a title="pyspark.rdd.RDD.partitionBy" class="py-name" href="#" onclick="return doclink('link-119', 'partitionBy', 'link-53');">partitionBy</a></tt><tt class="py-op">(</tt><tt class="py-name">partitioner</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">values</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L833"></a><tt class="py-lineno"> 833</tt>  <tt class="py-line">        <tt id="link-120" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-120', 'rdd', 'link-19');">rdd</a></tt> <tt class="py-op">=</tt> <tt id="link-121" class="py-name"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-121', 'RDD', 'link-40');">RDD</a></tt><tt class="py-op">(</tt><tt class="py-name">jrdd</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">,</tt> <tt class="py-name">BatchedSerializer</tt><tt class="py-op">(</tt><tt class="py-name">outputSerializer</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L834"></a><tt class="py-lineno"> 834</tt>  <tt class="py-line">        <tt class="py-comment"># This is required so that id(partitionFunc) remains unique, even if</tt> </tt>
<a name="L835"></a><tt class="py-lineno"> 835</tt>  <tt class="py-line">        <tt class="py-comment"># partitionFunc is a lambda:</tt> </tt>
<a name="L836"></a><tt class="py-lineno"> 836</tt>  <tt class="py-line">        <tt id="link-122" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-122', 'rdd', 'link-19');">rdd</a></tt><tt class="py-op">.</tt><tt class="py-name">_partitionFunc</tt> <tt class="py-op">=</tt> <tt class="py-name">partitionFunc</tt> </tt>
<a name="L837"></a><tt class="py-lineno"> 837</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-123" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-123', 'rdd', 'link-19');">rdd</a></tt> </tt>
</div><a name="L838"></a><tt class="py-lineno"> 838</tt>  <tt class="py-line"> </tt>
<a name="L839"></a><tt class="py-lineno"> 839</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: add control over map-side aggregation</tt> </tt>
<a name="RDD.combineByKey"></a><div id="RDD.combineByKey-def"><a name="L840"></a><tt class="py-lineno"> 840</tt> <a class="py-toggle" href="#" id="RDD.combineByKey-toggle" onclick="return toggle('RDD.combineByKey');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#combineByKey">combineByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">createCombiner</tt><tt class="py-op">,</tt> <tt class="py-param">mergeValue</tt><tt class="py-op">,</tt> <tt class="py-param">mergeCombiners</tt><tt class="py-op">,</tt> </tt>
<a name="L841"></a><tt class="py-lineno"> 841</tt>  <tt class="py-line">                     <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.combineByKey-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.combineByKey-expanded"><a name="L842"></a><tt class="py-lineno"> 842</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L843"></a><tt class="py-lineno"> 843</tt>  <tt class="py-line"><tt class="py-docstring">        Generic function to combine the elements for each key using a custom</tt> </tt>
<a name="L844"></a><tt class="py-lineno"> 844</tt>  <tt class="py-line"><tt class="py-docstring">        set of aggregation functions.</tt> </tt>
<a name="L845"></a><tt class="py-lineno"> 845</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L846"></a><tt class="py-lineno"> 846</tt>  <tt class="py-line"><tt class="py-docstring">        Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined</tt> </tt>
<a name="L847"></a><tt class="py-lineno"> 847</tt>  <tt class="py-line"><tt class="py-docstring">        type" C.  Note that V and C can be different -- for example, one might</tt> </tt>
<a name="L848"></a><tt class="py-lineno"> 848</tt>  <tt class="py-line"><tt class="py-docstring">        group an RDD of type (Int, Int) into an RDD of type (Int, List[Int]).</tt> </tt>
<a name="L849"></a><tt class="py-lineno"> 849</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L850"></a><tt class="py-lineno"> 850</tt>  <tt class="py-line"><tt class="py-docstring">        Users provide three functions:</tt> </tt>
<a name="L851"></a><tt class="py-lineno"> 851</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L852"></a><tt class="py-lineno"> 852</tt>  <tt class="py-line"><tt class="py-docstring">            - C{createCombiner}, which turns a V into a C (e.g., creates</tt> </tt>
<a name="L853"></a><tt class="py-lineno"> 853</tt>  <tt class="py-line"><tt class="py-docstring">              a one-element list)</tt> </tt>
<a name="L854"></a><tt class="py-lineno"> 854</tt>  <tt class="py-line"><tt class="py-docstring">            - C{mergeValue}, to merge a V into a C (e.g., adds it to the end of</tt> </tt>
<a name="L855"></a><tt class="py-lineno"> 855</tt>  <tt class="py-line"><tt class="py-docstring">              a list)</tt> </tt>
<a name="L856"></a><tt class="py-lineno"> 856</tt>  <tt class="py-line"><tt class="py-docstring">            - C{mergeCombiners}, to combine two C's into a single one.</tt> </tt>
<a name="L857"></a><tt class="py-lineno"> 857</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L858"></a><tt class="py-lineno"> 858</tt>  <tt class="py-line"><tt class="py-docstring">        In addition, users can control the partitioning of the output RDD.</tt> </tt>
<a name="L859"></a><tt class="py-lineno"> 859</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L860"></a><tt class="py-lineno"> 860</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
<a name="L861"></a><tt class="py-lineno"> 861</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; def f(x): return x</tt> </tt>
<a name="L862"></a><tt class="py-lineno"> 862</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; def add(a, b): return a + str(b)</tt> </tt>
<a name="L863"></a><tt class="py-lineno"> 863</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(x.combineByKey(str, add, add).collect())</tt> </tt>
<a name="L864"></a><tt class="py-lineno"> 864</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', '11'), ('b', '1')]</tt> </tt>
<a name="L865"></a><tt class="py-lineno"> 865</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L866"></a><tt class="py-lineno"> 866</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">numPartitions</tt> <tt class="py-keyword">is</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
<a name="L867"></a><tt class="py-lineno"> 867</tt>  <tt class="py-line">            <tt class="py-name">numPartitions</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-124" class="py-name"><a title="pyspark.context.SparkContext.defaultParallelism" class="py-name" href="#" onclick="return doclink('link-124', 'defaultParallelism', 'link-48');">defaultParallelism</a></tt> </tt>
<a name="L868"></a><tt class="py-lineno"> 868</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">combineLocally</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L869"></a><tt class="py-lineno"> 869</tt>  <tt class="py-line">            <tt class="py-name">combiners</tt> <tt class="py-op">=</tt> <tt class="py-op">{</tt><tt class="py-op">}</tt> </tt>
<a name="L870"></a><tt class="py-lineno"> 870</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L871"></a><tt class="py-lineno"> 871</tt>  <tt class="py-line">                <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-op">=</tt> <tt class="py-name">x</tt> </tt>
<a name="L872"></a><tt class="py-lineno"> 872</tt>  <tt class="py-line">                <tt class="py-keyword">if</tt> <tt class="py-name">k</tt> <tt class="py-keyword">not</tt> <tt class="py-keyword">in</tt> <tt class="py-name">combiners</tt><tt class="py-op">:</tt> </tt>
<a name="L873"></a><tt class="py-lineno"> 873</tt>  <tt class="py-line">                    <tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">createCombiner</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
<a name="L874"></a><tt class="py-lineno"> 874</tt>  <tt class="py-line">                <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L875"></a><tt class="py-lineno"> 875</tt>  <tt class="py-line">                    <tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">mergeValue</tt><tt class="py-op">(</tt><tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
<a name="L876"></a><tt class="py-lineno"> 876</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">combiners</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L877"></a><tt class="py-lineno"> 877</tt>  <tt class="py-line">        <tt class="py-name">locally_combined</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-125" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-125', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">combineLocally</tt><tt class="py-op">)</tt> </tt>
<a name="L878"></a><tt class="py-lineno"> 878</tt>  <tt class="py-line">        <tt class="py-name">shuffled</tt> <tt class="py-op">=</tt> <tt class="py-name">locally_combined</tt><tt class="py-op">.</tt><tt id="link-126" class="py-name"><a title="pyspark.rdd.RDD.partitionBy" class="py-name" href="#" onclick="return doclink('link-126', 'partitionBy', 'link-53');">partitionBy</a></tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
<a name="L879"></a><tt class="py-lineno"> 879</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">_mergeCombiners</tt><tt class="py-op">(</tt><tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L880"></a><tt class="py-lineno"> 880</tt>  <tt class="py-line">            <tt class="py-name">combiners</tt> <tt class="py-op">=</tt> <tt class="py-op">{</tt><tt class="py-op">}</tt> </tt>
<a name="L881"></a><tt class="py-lineno"> 881</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L882"></a><tt class="py-lineno"> 882</tt>  <tt class="py-line">                <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">k</tt> <tt class="py-keyword">in</tt> <tt class="py-name">combiners</tt><tt class="py-op">:</tt> </tt>
<a name="L883"></a><tt class="py-lineno"> 883</tt>  <tt class="py-line">                    <tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">v</tt> </tt>
<a name="L884"></a><tt class="py-lineno"> 884</tt>  <tt class="py-line">                <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L885"></a><tt class="py-lineno"> 885</tt>  <tt class="py-line">                    <tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">mergeCombiners</tt><tt class="py-op">(</tt><tt class="py-name">combiners</tt><tt class="py-op">[</tt><tt class="py-name">k</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt> </tt>
<a name="L886"></a><tt class="py-lineno"> 886</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">combiners</tt><tt class="py-op">.</tt><tt class="py-name">iteritems</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L887"></a><tt class="py-lineno"> 887</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">shuffled</tt><tt class="py-op">.</tt><tt id="link-127" class="py-name"><a title="pyspark.rdd.RDD.mapPartitions" class="py-name" href="#" onclick="return doclink('link-127', 'mapPartitions', 'link-29');">mapPartitions</a></tt><tt class="py-op">(</tt><tt class="py-name">_mergeCombiners</tt><tt class="py-op">)</tt> </tt>
</div><a name="L888"></a><tt class="py-lineno"> 888</tt>  <tt class="py-line"> </tt>
<a name="L889"></a><tt class="py-lineno"> 889</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: support variant with custom partitioner</tt> </tt>
<a name="RDD.groupByKey"></a><div id="RDD.groupByKey-def"><a name="L890"></a><tt class="py-lineno"> 890</tt> <a class="py-toggle" href="#" id="RDD.groupByKey-toggle" onclick="return toggle('RDD.groupByKey');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#groupByKey">groupByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.groupByKey-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.groupByKey-expanded"><a name="L891"></a><tt class="py-lineno"> 891</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L892"></a><tt class="py-lineno"> 892</tt>  <tt class="py-line"><tt class="py-docstring">        Group the values for each key in the RDD into a single sequence.</tt> </tt>
<a name="L893"></a><tt class="py-lineno"> 893</tt>  <tt class="py-line"><tt class="py-docstring">        Hash-partitions the resulting RDD into numPartitions partitions.</tt> </tt>
<a name="L894"></a><tt class="py-lineno"> 894</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L895"></a><tt class="py-lineno"> 895</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])</tt> </tt>
<a name="L896"></a><tt class="py-lineno"> 896</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(x.groupByKey().collect())</tt> </tt>
<a name="L897"></a><tt class="py-lineno"> 897</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', [1, 1]), ('b', [1])]</tt> </tt>
<a name="L898"></a><tt class="py-lineno"> 898</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L899"></a><tt class="py-lineno"> 899</tt>  <tt class="py-line"> </tt>
<a name="L900"></a><tt class="py-lineno"> 900</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">createCombiner</tt><tt class="py-op">(</tt><tt class="py-param">x</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L901"></a><tt class="py-lineno"> 901</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-op">[</tt><tt class="py-name">x</tt><tt class="py-op">]</tt> </tt>
</div><a name="L902"></a><tt class="py-lineno"> 902</tt>  <tt class="py-line"> </tt>
<a name="L903"></a><tt class="py-lineno"> 903</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">mergeValue</tt><tt class="py-op">(</tt><tt class="py-param">xs</tt><tt class="py-op">,</tt> <tt class="py-param">x</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L904"></a><tt class="py-lineno"> 904</tt>  <tt class="py-line">            <tt class="py-name">xs</tt><tt class="py-op">.</tt><tt class="py-name">append</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt> </tt>
<a name="L905"></a><tt class="py-lineno"> 905</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">xs</tt> </tt>
</div><a name="L906"></a><tt class="py-lineno"> 906</tt>  <tt class="py-line"> </tt>
<a name="L907"></a><tt class="py-lineno"> 907</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">mergeCombiners</tt><tt class="py-op">(</tt><tt class="py-param">a</tt><tt class="py-op">,</tt> <tt class="py-param">b</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L908"></a><tt class="py-lineno"> 908</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">a</tt> <tt class="py-op">+</tt> <tt class="py-name">b</tt> </tt>
</div><a name="L909"></a><tt class="py-lineno"> 909</tt>  <tt class="py-line"> </tt>
<a name="L910"></a><tt class="py-lineno"> 910</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-128" class="py-name"><a title="pyspark.rdd.RDD.combineByKey" class="py-name" href="#" onclick="return doclink('link-128', 'combineByKey', 'link-108');">combineByKey</a></tt><tt class="py-op">(</tt><tt class="py-name">createCombiner</tt><tt class="py-op">,</tt> <tt class="py-name">mergeValue</tt><tt class="py-op">,</tt> <tt class="py-name">mergeCombiners</tt><tt class="py-op">,</tt> </tt>
<a name="L911"></a><tt class="py-lineno"> 911</tt>  <tt class="py-line">                <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
</div><a name="L912"></a><tt class="py-lineno"> 912</tt>  <tt class="py-line"> </tt>
<a name="L913"></a><tt class="py-lineno"> 913</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: add tests</tt> </tt>
<a name="RDD.flatMapValues"></a><div id="RDD.flatMapValues-def"><a name="L914"></a><tt class="py-lineno"> 914</tt> <a class="py-toggle" href="#" id="RDD.flatMapValues-toggle" onclick="return toggle('RDD.flatMapValues');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#flatMapValues">flatMapValues</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.flatMapValues-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.flatMapValues-expanded"><a name="L915"></a><tt class="py-lineno"> 915</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L916"></a><tt class="py-lineno"> 916</tt>  <tt class="py-line"><tt class="py-docstring">        Pass each value in the key-value pair RDD through a flatMap function</tt> </tt>
<a name="L917"></a><tt class="py-lineno"> 917</tt>  <tt class="py-line"><tt class="py-docstring">        without changing the keys; this also retains the original RDD's</tt> </tt>
<a name="L918"></a><tt class="py-lineno"> 918</tt>  <tt class="py-line"><tt class="py-docstring">        partitioning.</tt> </tt>
<a name="L919"></a><tt class="py-lineno"> 919</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L920"></a><tt class="py-lineno"> 920</tt>  <tt class="py-line">        <tt class="py-name">flat_map_fn</tt> <tt class="py-op">=</tt> <tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt> <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L921"></a><tt class="py-lineno"> 921</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-129" class="py-name"><a title="pyspark.rdd.RDD.flatMap" class="py-name" href="#" onclick="return doclink('link-129', 'flatMap', 'link-55');">flatMap</a></tt><tt class="py-op">(</tt><tt class="py-name">flat_map_fn</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt> </tt>
</div><a name="L922"></a><tt class="py-lineno"> 922</tt>  <tt class="py-line"> </tt>
<a name="RDD.mapValues"></a><div id="RDD.mapValues-def"><a name="L923"></a><tt class="py-lineno"> 923</tt> <a class="py-toggle" href="#" id="RDD.mapValues-toggle" onclick="return toggle('RDD.mapValues');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#mapValues">mapValues</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.mapValues-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.mapValues-expanded"><a name="L924"></a><tt class="py-lineno"> 924</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L925"></a><tt class="py-lineno"> 925</tt>  <tt class="py-line"><tt class="py-docstring">        Pass each value in the key-value pair RDD through a map function</tt> </tt>
<a name="L926"></a><tt class="py-lineno"> 926</tt>  <tt class="py-line"><tt class="py-docstring">        without changing the keys; this also retains the original RDD's</tt> </tt>
<a name="L927"></a><tt class="py-lineno"> 927</tt>  <tt class="py-line"><tt class="py-docstring">        partitioning.</tt> </tt>
<a name="L928"></a><tt class="py-lineno"> 928</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L929"></a><tt class="py-lineno"> 929</tt>  <tt class="py-line">        <tt class="py-name">map_values_fn</tt> <tt class="py-op">=</tt> <tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">k</tt><tt class="py-op">,</tt> <tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">v</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L930"></a><tt class="py-lineno"> 930</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-130" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-130', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-name">map_values_fn</tt><tt class="py-op">,</tt> <tt class="py-name">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">True</tt><tt class="py-op">)</tt> </tt>
</div><a name="L931"></a><tt class="py-lineno"> 931</tt>  <tt class="py-line"> </tt>
<a name="L932"></a><tt class="py-lineno"> 932</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: support varargs cogroup of several RDDs.</tt> </tt>
<a name="RDD.groupWith"></a><div id="RDD.groupWith-def"><a name="L933"></a><tt class="py-lineno"> 933</tt> <a class="py-toggle" href="#" id="RDD.groupWith-toggle" onclick="return toggle('RDD.groupWith');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#groupWith">groupWith</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.groupWith-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.groupWith-expanded"><a name="L934"></a><tt class="py-lineno"> 934</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L935"></a><tt class="py-lineno"> 935</tt>  <tt class="py-line"><tt class="py-docstring">        Alias for cogroup.</tt> </tt>
<a name="L936"></a><tt class="py-lineno"> 936</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L937"></a><tt class="py-lineno"> 937</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-131" class="py-name" targets="Method pyspark.rdd.RDD.cogroup()=pyspark.rdd.RDD-class.html#cogroup"><a title="pyspark.rdd.RDD.cogroup" class="py-name" href="#" onclick="return doclink('link-131', 'cogroup', 'link-131');">cogroup</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">)</tt> </tt>
</div><a name="L938"></a><tt class="py-lineno"> 938</tt>  <tt class="py-line"> </tt>
<a name="L939"></a><tt class="py-lineno"> 939</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: add variant with custom partitioner</tt> </tt>
<a name="RDD.cogroup"></a><div id="RDD.cogroup-def"><a name="L940"></a><tt class="py-lineno"> 940</tt> <a class="py-toggle" href="#" id="RDD.cogroup-toggle" onclick="return toggle('RDD.cogroup');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#cogroup">cogroup</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.cogroup-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.cogroup-expanded"><a name="L941"></a><tt class="py-lineno"> 941</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L942"></a><tt class="py-lineno"> 942</tt>  <tt class="py-line"><tt class="py-docstring">        For each key k in C{self} or C{other}, return a resulting RDD that</tt> </tt>
<a name="L943"></a><tt class="py-lineno"> 943</tt>  <tt class="py-line"><tt class="py-docstring">        contains a tuple with the list of values for that key in C{self} as well</tt> </tt>
<a name="L944"></a><tt class="py-lineno"> 944</tt>  <tt class="py-line"><tt class="py-docstring">        as C{other}.</tt> </tt>
<a name="L945"></a><tt class="py-lineno"> 945</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L946"></a><tt class="py-lineno"> 946</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4)])</tt> </tt>
<a name="L947"></a><tt class="py-lineno"> 947</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; y = sc.parallelize([("a", 2)])</tt> </tt>
<a name="L948"></a><tt class="py-lineno"> 948</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(x.cogroup(y).collect())</tt> </tt>
<a name="L949"></a><tt class="py-lineno"> 949</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', ([1], [2])), ('b', ([4], []))]</tt> </tt>
<a name="L950"></a><tt class="py-lineno"> 950</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L951"></a><tt class="py-lineno"> 951</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">python_cogroup</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
</div><a name="L952"></a><tt class="py-lineno"> 952</tt>  <tt class="py-line"> </tt>
<a name="RDD.subtractByKey"></a><div id="RDD.subtractByKey-def"><a name="L953"></a><tt class="py-lineno"> 953</tt> <a class="py-toggle" href="#" id="RDD.subtractByKey-toggle" onclick="return toggle('RDD.subtractByKey');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#subtractByKey">subtractByKey</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.subtractByKey-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.subtractByKey-expanded"><a name="L954"></a><tt class="py-lineno"> 954</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L955"></a><tt class="py-lineno"> 955</tt>  <tt class="py-line"><tt class="py-docstring">        Return each (key, value) pair in C{self} that has no pair with matching key</tt> </tt>
<a name="L956"></a><tt class="py-lineno"> 956</tt>  <tt class="py-line"><tt class="py-docstring">        in C{other}.</tt> </tt>
<a name="L957"></a><tt class="py-lineno"> 957</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L958"></a><tt class="py-lineno"> 958</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 2)])</tt> </tt>
<a name="L959"></a><tt class="py-lineno"> 959</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; y = sc.parallelize([("a", 3), ("c", None)])</tt> </tt>
<a name="L960"></a><tt class="py-lineno"> 960</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(x.subtractByKey(y).collect())</tt> </tt>
<a name="L961"></a><tt class="py-lineno"> 961</tt>  <tt class="py-line"><tt class="py-docstring">        [('b', 4), ('b', 5)]</tt> </tt>
<a name="L962"></a><tt class="py-lineno"> 962</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L963"></a><tt class="py-lineno"> 963</tt>  <tt class="py-line">        <tt class="py-name">filter_func</tt> <tt class="py-op">=</tt> <tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">key</tt><tt class="py-op">,</tt> <tt class="py-name">vals</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">vals</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> <tt class="py-op">&gt;</tt> <tt class="py-number">0</tt> <tt class="py-keyword">and</tt> <tt class="py-name">len</tt><tt class="py-op">(</tt><tt class="py-name">vals</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> <tt class="py-op">==</tt> <tt class="py-number">0</tt> </tt>
<a name="L964"></a><tt class="py-lineno"> 964</tt>  <tt class="py-line">        <tt class="py-name">map_func</tt> <tt class="py-op">=</tt> <tt class="py-keyword">lambda</tt> <tt class="py-op">(</tt><tt class="py-name">key</tt><tt class="py-op">,</tt> <tt class="py-name">vals</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> <tt class="py-op">[</tt><tt class="py-op">(</tt><tt class="py-name">key</tt><tt class="py-op">,</tt> <tt class="py-name">val</tt><tt class="py-op">)</tt> <tt class="py-keyword">for</tt> <tt class="py-name">val</tt> <tt class="py-keyword">in</tt> <tt class="py-name">vals</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">]</tt> </tt>
<a name="L965"></a><tt class="py-lineno"> 965</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-132" class="py-name"><a title="pyspark.rdd.RDD.cogroup" class="py-name" href="#" onclick="return doclink('link-132', 'cogroup', 'link-131');">cogroup</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-133" class="py-name" targets="Method pyspark.rdd.RDD.filter()=pyspark.rdd.RDD-class.html#filter"><a title="pyspark.rdd.RDD.filter" class="py-name" href="#" onclick="return doclink('link-133', 'filter', 'link-133');">filter</a></tt><tt class="py-op">(</tt><tt class="py-name">filter_func</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-134" class="py-name"><a title="pyspark.rdd.RDD.flatMap" class="py-name" href="#" onclick="return doclink('link-134', 'flatMap', 'link-55');">flatMap</a></tt><tt class="py-op">(</tt><tt class="py-name">map_func</tt><tt class="py-op">)</tt> </tt>
</div><a name="L966"></a><tt class="py-lineno"> 966</tt>  <tt class="py-line"> </tt>
<a name="RDD.subtract"></a><div id="RDD.subtract-def"><a name="L967"></a><tt class="py-lineno"> 967</tt> <a class="py-toggle" href="#" id="RDD.subtract-toggle" onclick="return toggle('RDD.subtract');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#subtract">subtract</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.subtract-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.subtract-expanded"><a name="L968"></a><tt class="py-lineno"> 968</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L969"></a><tt class="py-lineno"> 969</tt>  <tt class="py-line"><tt class="py-docstring">        Return each value in C{self} that is not contained in C{other}.</tt> </tt>
<a name="L970"></a><tt class="py-lineno"> 970</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L971"></a><tt class="py-lineno"> 971</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize([("a", 1), ("b", 4), ("b", 5), ("a", 3)])</tt> </tt>
<a name="L972"></a><tt class="py-lineno"> 972</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; y = sc.parallelize([("a", 3), ("c", None)])</tt> </tt>
<a name="L973"></a><tt class="py-lineno"> 973</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(x.subtract(y).collect())</tt> </tt>
<a name="L974"></a><tt class="py-lineno"> 974</tt>  <tt class="py-line"><tt class="py-docstring">        [('a', 1), ('b', 4), ('b', 5)]</tt> </tt>
<a name="L975"></a><tt class="py-lineno"> 975</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L976"></a><tt class="py-lineno"> 976</tt>  <tt class="py-line">        <tt id="link-135" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-135', 'rdd', 'link-19');">rdd</a></tt> <tt class="py-op">=</tt> <tt class="py-name">other</tt><tt class="py-op">.</tt><tt id="link-136" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-136', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">True</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> <tt class="py-comment"># note: here 'True' is just a placeholder</tt> </tt>
<a name="L977"></a><tt class="py-lineno"> 977</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-137" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-137', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">True</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-138" class="py-name" targets="Method pyspark.rdd.RDD.subtractByKey()=pyspark.rdd.RDD-class.html#subtractByKey"><a title="pyspark.rdd.RDD.subtractByKey" class="py-name" href="#" onclick="return doclink('link-138', 'subtractByKey', 'link-138');">subtractByKey</a></tt><tt class="py-op">(</tt><tt id="link-139" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-139', 'rdd', 'link-19');">rdd</a></tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-140" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-140', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">tpl</tt><tt class="py-op">:</tt> <tt class="py-name">tpl</tt><tt class="py-op">[</tt><tt class="py-number">0</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> <tt class="py-comment"># note: here 'True' is just a placeholder</tt> </tt>
</div><a name="L978"></a><tt class="py-lineno"> 978</tt>  <tt class="py-line"> </tt>
<a name="RDD.keyBy"></a><div id="RDD.keyBy-def"><a name="L979"></a><tt class="py-lineno"> 979</tt> <a class="py-toggle" href="#" id="RDD.keyBy-toggle" onclick="return toggle('RDD.keyBy');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.RDD-class.html#keyBy">keyBy</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">f</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="RDD.keyBy-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="RDD.keyBy-expanded"><a name="L980"></a><tt class="py-lineno"> 980</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L981"></a><tt class="py-lineno"> 981</tt>  <tt class="py-line"><tt class="py-docstring">        Creates tuples of the elements in this RDD by applying C{f}.</tt> </tt>
<a name="L982"></a><tt class="py-lineno"> 982</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L983"></a><tt class="py-lineno"> 983</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; x = sc.parallelize(range(0,3)).keyBy(lambda x: x*x)</tt> </tt>
<a name="L984"></a><tt class="py-lineno"> 984</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; y = sc.parallelize(zip(range(0,5), range(0,5)))</tt> </tt>
<a name="L985"></a><tt class="py-lineno"> 985</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(x.cogroup(y).collect())</tt> </tt>
<a name="L986"></a><tt class="py-lineno"> 986</tt>  <tt class="py-line"><tt class="py-docstring">        [(0, ([0], [0])), (1, ([1], [1])), (2, ([], [2])), (3, ([], [3])), (4, ([2], [4]))]</tt> </tt>
<a name="L987"></a><tt class="py-lineno"> 987</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L988"></a><tt class="py-lineno"> 988</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-141" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-141', 'map', 'link-30');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">x</tt><tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-name">f</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">x</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L989"></a><tt class="py-lineno"> 989</tt>  <tt class="py-line"> </tt>
<a name="PipelinedRDD"></a><div id="PipelinedRDD-def"><a name="L990"></a><tt class="py-lineno"> 990</tt>  <tt class="py-line">    <tt class="py-comment"># TODO: `lookup` is disabled because we can't make direct comparisons based</tt> </tt>
<a name="L991"></a><tt class="py-lineno"> 991</tt>  <tt class="py-line">    <tt class="py-comment"># on the key; we need to compare the hash of the key to the hash of the</tt> </tt>
<a name="L992"></a><tt class="py-lineno"> 992</tt>  <tt class="py-line">    <tt class="py-comment"># keys in the pairs.  This could be an expensive operation, since those</tt> </tt>
<a name="L993"></a><tt class="py-lineno"> 993</tt>  <tt class="py-line">    <tt class="py-comment"># hashes aren't retained.</tt> </tt>
<a name="L994"></a><tt class="py-lineno"> 994</tt>  <tt class="py-line"> </tt>
<a name="L995"></a><tt class="py-lineno"> 995</tt>  <tt class="py-line"> </tt>
<a name="L996"></a><tt class="py-lineno"> 996</tt> <a class="py-toggle" href="#" id="PipelinedRDD-toggle" onclick="return toggle('PipelinedRDD');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.rdd.PipelinedRDD-class.html">PipelinedRDD</a><tt class="py-op">(</tt><tt class="py-base-class">RDD</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="PipelinedRDD-collapsed" style="display:none;" pad="++++" indent="++++"></div><div id="PipelinedRDD-expanded"><a name="L997"></a><tt class="py-lineno"> 997</tt>  <tt class="py-line">    <tt class="py-docstring">"""</tt> </tt>
<a name="L998"></a><tt class="py-lineno"> 998</tt>  <tt class="py-line"><tt class="py-docstring">    Pipelined maps:</tt> </tt>
<a name="L999"></a><tt class="py-lineno"> 999</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; rdd = sc.parallelize([1, 2, 3, 4])</tt> </tt>
<a name="L1000"></a><tt class="py-lineno">1000</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()</tt> </tt>
<a name="L1001"></a><tt class="py-lineno">1001</tt>  <tt class="py-line"><tt class="py-docstring">    [4, 8, 12, 16]</tt> </tt>
<a name="L1002"></a><tt class="py-lineno">1002</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; rdd.map(lambda x: 2 * x).map(lambda x: 2 * x).collect()</tt> </tt>
<a name="L1003"></a><tt class="py-lineno">1003</tt>  <tt class="py-line"><tt class="py-docstring">    [4, 8, 12, 16]</tt> </tt>
<a name="L1004"></a><tt class="py-lineno">1004</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L1005"></a><tt class="py-lineno">1005</tt>  <tt class="py-line"><tt class="py-docstring">    Pipelined reduces:</tt> </tt>
<a name="L1006"></a><tt class="py-lineno">1006</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; from operator import add</tt> </tt>
<a name="L1007"></a><tt class="py-lineno">1007</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; rdd.map(lambda x: 2 * x).reduce(add)</tt> </tt>
<a name="L1008"></a><tt class="py-lineno">1008</tt>  <tt class="py-line"><tt class="py-docstring">    20</tt> </tt>
<a name="L1009"></a><tt class="py-lineno">1009</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; rdd.flatMap(lambda x: [x, x]).reduce(add)</tt> </tt>
<a name="L1010"></a><tt class="py-lineno">1010</tt>  <tt class="py-line"><tt class="py-docstring">    20</tt> </tt>
<a name="L1011"></a><tt class="py-lineno">1011</tt>  <tt class="py-line"><tt class="py-docstring">    """</tt> </tt>
<a name="PipelinedRDD.__init__"></a><div id="PipelinedRDD.__init__-def"><a name="L1012"></a><tt class="py-lineno">1012</tt> <a class="py-toggle" href="#" id="PipelinedRDD.__init__-toggle" onclick="return toggle('PipelinedRDD.__init__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.PipelinedRDD-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">prev</tt><tt class="py-op">,</tt> <tt class="py-param">func</tt><tt class="py-op">,</tt> <tt class="py-param">preservesPartitioning</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="PipelinedRDD.__init__-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="PipelinedRDD.__init__-expanded"><a name="L1013"></a><tt class="py-lineno">1013</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">isinstance</tt><tt class="py-op">(</tt><tt class="py-name">prev</tt><tt class="py-op">,</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">)</tt> <tt class="py-keyword">or</tt> <tt class="py-keyword">not</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">_is_pipelinable</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L1014"></a><tt class="py-lineno">1014</tt>  <tt class="py-line">            <tt class="py-comment"># This transformation is the first in its stage:</tt> </tt>
<a name="L1015"></a><tt class="py-lineno">1015</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">func</tt> <tt class="py-op">=</tt> <tt class="py-name">func</tt> </tt>
<a name="L1016"></a><tt class="py-lineno">1016</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">preservesPartitioning</tt> <tt class="py-op">=</tt> <tt class="py-name">preservesPartitioning</tt> </tt>
<a name="L1017"></a><tt class="py-lineno">1017</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt> </tt>
<a name="L1018"></a><tt class="py-lineno">1018</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd_deserializer</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt> </tt>
<a name="L1019"></a><tt class="py-lineno">1019</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L1020"></a><tt class="py-lineno">1020</tt>  <tt class="py-line">            <tt class="py-name">prev_func</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">func</tt> </tt>
<a name="L1021"></a><tt class="py-lineno">1021</tt>  <tt class="py-line">            <tt class="py-keyword">def</tt> <tt class="py-def-name">pipeline_func</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L1022"></a><tt class="py-lineno">1022</tt>  <tt class="py-line">                <tt class="py-keyword">return</tt> <tt class="py-name">func</tt><tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">,</tt> <tt class="py-name">prev_func</tt><tt class="py-op">(</tt><tt class="py-name">split</tt><tt class="py-op">,</tt> <tt class="py-name">iterator</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div><a name="L1023"></a><tt class="py-lineno">1023</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">func</tt> <tt class="py-op">=</tt> <tt class="py-name">pipeline_func</tt> </tt>
<a name="L1024"></a><tt class="py-lineno">1024</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">preservesPartitioning</tt> <tt class="py-op">=</tt> \ </tt>
<a name="L1025"></a><tt class="py-lineno">1025</tt>  <tt class="py-line">                <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">preservesPartitioning</tt> <tt class="py-keyword">and</tt> <tt class="py-name">preservesPartitioning</tt> </tt>
<a name="L1026"></a><tt class="py-lineno">1026</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt>  <tt class="py-comment"># maintain the pipeline</tt> </tt>
<a name="L1027"></a><tt class="py-lineno">1027</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd_deserializer</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd_deserializer</tt> </tt>
<a name="L1028"></a><tt class="py-lineno">1028</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
<a name="L1029"></a><tt class="py-lineno">1029</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
<a name="L1030"></a><tt class="py-lineno">1030</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> </tt>
<a name="L1031"></a><tt class="py-lineno">1031</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">prev</tt> <tt class="py-op">=</tt> <tt class="py-name">prev</tt> </tt>
<a name="L1032"></a><tt class="py-lineno">1032</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt> <tt class="py-op">=</tt> <tt class="py-name">None</tt> </tt>
<a name="L1033"></a><tt class="py-lineno">1033</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">serializer</tt> </tt>
<a name="L1034"></a><tt class="py-lineno">1034</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
</div><a name="L1035"></a><tt class="py-lineno">1035</tt>  <tt class="py-line"> </tt>
<a name="L1036"></a><tt class="py-lineno">1036</tt>  <tt class="py-line">    <tt class="py-decorator">@</tt><tt class="py-decorator">property</tt> </tt>
<a name="PipelinedRDD._jrdd"></a><div id="PipelinedRDD._jrdd-def"><a name="L1037"></a><tt class="py-lineno">1037</tt> <a class="py-toggle" href="#" id="PipelinedRDD._jrdd-toggle" onclick="return toggle('PipelinedRDD._jrdd');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.PipelinedRDD-class.html#_jrdd">_jrdd</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="PipelinedRDD._jrdd-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="PipelinedRDD._jrdd-expanded"><a name="L1038"></a><tt class="py-lineno">1038</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt><tt class="py-op">:</tt> </tt>
<a name="L1039"></a><tt class="py-lineno">1039</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt> </tt>
<a name="L1040"></a><tt class="py-lineno">1040</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt><tt class="py-op">:</tt> </tt>
<a name="L1041"></a><tt class="py-lineno">1041</tt>  <tt class="py-line">            <tt class="py-name">serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">NoOpSerializer</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L1042"></a><tt class="py-lineno">1042</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L1043"></a><tt class="py-lineno">1043</tt>  <tt class="py-line">            <tt class="py-name">serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">serializer</tt> </tt>
<a name="L1044"></a><tt class="py-lineno">1044</tt>  <tt class="py-line">        <tt class="py-name">command</tt> <tt class="py-op">=</tt> <tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">func</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd_deserializer</tt><tt class="py-op">,</tt> <tt class="py-name">serializer</tt><tt class="py-op">)</tt> </tt>
<a name="L1045"></a><tt class="py-lineno">1045</tt>  <tt class="py-line">        <tt class="py-name">pickled_command</tt> <tt class="py-op">=</tt> <tt class="py-name">CloudPickleSerializer</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-142" class="py-name"><a title="pyspark.serializers.MarshalSerializer.dumps
pyspark.serializers.PickleSerializer.dumps" class="py-name" href="#" onclick="return doclink('link-142', 'dumps', 'link-114');">dumps</a></tt><tt class="py-op">(</tt><tt class="py-name">command</tt><tt class="py-op">)</tt> </tt>
<a name="L1046"></a><tt class="py-lineno">1046</tt>  <tt class="py-line">        <tt class="py-name">broadcast_vars</tt> <tt class="py-op">=</tt> <tt class="py-name">ListConverter</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">convert</tt><tt class="py-op">(</tt> </tt>
<a name="L1047"></a><tt class="py-lineno">1047</tt>  <tt class="py-line">            <tt class="py-op">[</tt><tt class="py-name">x</tt><tt class="py-op">.</tt><tt class="py-name">_jbroadcast</tt> <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_pickled_broadcast_vars</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> </tt>
<a name="L1048"></a><tt class="py-lineno">1048</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-143" class="py-name"><a title="pyspark.context.SparkContext._gateway" class="py-name" href="#" onclick="return doclink('link-143', '_gateway', 'link-101');">_gateway</a></tt><tt class="py-op">.</tt><tt class="py-name">_gateway_client</tt><tt class="py-op">)</tt> </tt>
<a name="L1049"></a><tt class="py-lineno">1049</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_pickled_broadcast_vars</tt><tt class="py-op">.</tt><tt class="py-name">clear</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L1050"></a><tt class="py-lineno">1050</tt>  <tt class="py-line">        <tt class="py-name">class_tag</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt><tt class="py-op">.</tt><tt class="py-name">classTag</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L1051"></a><tt class="py-lineno">1051</tt>  <tt class="py-line">        <tt class="py-name">env</tt> <tt class="py-op">=</tt> <tt class="py-name">MapConverter</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">convert</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">environment</tt><tt class="py-op">,</tt> </tt>
<a name="L1052"></a><tt class="py-lineno">1052</tt>  <tt class="py-line">                                     <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-144" class="py-name"><a title="pyspark.context.SparkContext._gateway" class="py-name" href="#" onclick="return doclink('link-144', '_gateway', 'link-101');">_gateway</a></tt><tt class="py-op">.</tt><tt class="py-name">_gateway_client</tt><tt class="py-op">)</tt> </tt>
<a name="L1053"></a><tt class="py-lineno">1053</tt>  <tt class="py-line">        <tt class="py-name">includes</tt> <tt class="py-op">=</tt> <tt class="py-name">ListConverter</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">convert</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-145" class="py-name" targets="Variable pyspark.context.SparkContext._python_includes=pyspark.context.SparkContext-class.html#_python_includes"><a title="pyspark.context.SparkContext._python_includes" class="py-name" href="#" onclick="return doclink('link-145', '_python_includes', 'link-145');">_python_includes</a></tt><tt class="py-op">,</tt> </tt>
<a name="L1054"></a><tt class="py-lineno">1054</tt>  <tt class="py-line">                                     <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-146" class="py-name"><a title="pyspark.context.SparkContext._gateway" class="py-name" href="#" onclick="return doclink('link-146', '_gateway', 'link-101');">_gateway</a></tt><tt class="py-op">.</tt><tt class="py-name">_gateway_client</tt><tt class="py-op">)</tt> </tt>
<a name="L1055"></a><tt class="py-lineno">1055</tt>  <tt class="py-line">        <tt class="py-name">python_rdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt id="link-147" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-147', '_jvm', 'link-102');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">PythonRDD</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_prev_jrdd</tt><tt class="py-op">.</tt><tt id="link-148" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-148', 'rdd', 'link-19');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> </tt>
<a name="L1056"></a><tt class="py-lineno">1056</tt>  <tt class="py-line">            <tt class="py-name">bytearray</tt><tt class="py-op">(</tt><tt class="py-name">pickled_command</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">env</tt><tt class="py-op">,</tt> <tt class="py-name">includes</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">preservesPartitioning</tt><tt class="py-op">,</tt> </tt>
<a name="L1057"></a><tt class="py-lineno">1057</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">pythonExec</tt><tt class="py-op">,</tt> <tt class="py-name">broadcast_vars</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_javaAccumulator</tt><tt class="py-op">,</tt> </tt>
<a name="L1058"></a><tt class="py-lineno">1058</tt>  <tt class="py-line">            <tt class="py-name">class_tag</tt><tt class="py-op">)</tt> </tt>
<a name="L1059"></a><tt class="py-lineno">1059</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt> <tt class="py-op">=</tt> <tt class="py-name">python_rdd</tt><tt class="py-op">.</tt><tt class="py-name">asJavaRDD</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L1060"></a><tt class="py-lineno">1060</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_val</tt> </tt>
</div><a name="L1061"></a><tt class="py-lineno">1061</tt>  <tt class="py-line"> </tt>
<a name="PipelinedRDD._is_pipelinable"></a><div id="PipelinedRDD._is_pipelinable-def"><a name="L1062"></a><tt class="py-lineno">1062</tt> <a class="py-toggle" href="#" id="PipelinedRDD._is_pipelinable-toggle" onclick="return toggle('PipelinedRDD._is_pipelinable');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd.PipelinedRDD-class.html#_is_pipelinable">_is_pipelinable</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="PipelinedRDD._is_pipelinable-collapsed" style="display:none;" pad="++++" indent="++++++++"></div><div id="PipelinedRDD._is_pipelinable-expanded"><a name="L1063"></a><tt class="py-lineno">1063</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-keyword">not</tt> <tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-keyword">or</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L1064"></a><tt class="py-lineno">1064</tt>  <tt class="py-line"> </tt>
<a name="_test"></a><div id="_test-def"><a name="L1065"></a><tt class="py-lineno">1065</tt>  <tt class="py-line"> </tt>
<a name="L1066"></a><tt class="py-lineno">1066</tt> <a class="py-toggle" href="#" id="_test-toggle" onclick="return toggle('_test');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.rdd-module.html#_test">_test</a><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="_test-collapsed" style="display:none;" pad="++++" indent="++++"></div><div id="_test-expanded"><a name="L1067"></a><tt class="py-lineno">1067</tt>  <tt class="py-line">    <tt class="py-keyword">import</tt> <tt class="py-name">doctest</tt> </tt>
<a name="L1068"></a><tt class="py-lineno">1068</tt>  <tt class="py-line">    <tt class="py-keyword">from</tt> <tt id="link-149" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-149', 'pyspark', 'link-1');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-150" class="py-name"><a title="pyspark.context
pyspark.rdd.RDD.context" class="py-name" href="#" onclick="return doclink('link-150', 'context', 'link-67');">context</a></tt> <tt class="py-keyword">import</tt> <tt id="link-151" class="py-name" targets="Class pyspark.context.SparkContext=pyspark.context.SparkContext-class.html"><a title="pyspark.context.SparkContext" class="py-name" href="#" onclick="return doclink('link-151', 'SparkContext', 'link-151');">SparkContext</a></tt> </tt>
<a name="L1069"></a><tt class="py-lineno">1069</tt>  <tt class="py-line">    <tt class="py-name">globs</tt> <tt class="py-op">=</tt> <tt class="py-name">globals</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-152" class="py-name"><a title="pyspark.statcounter.StatCounter.copy" class="py-name" href="#" onclick="return doclink('link-152', 'copy', 'link-0');">copy</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L1070"></a><tt class="py-lineno">1070</tt>  <tt class="py-line">    <tt class="py-comment"># The small batch size here ensures that we see multiple batches,</tt> </tt>
<a name="L1071"></a><tt class="py-lineno">1071</tt>  <tt class="py-line">    <tt class="py-comment"># even in these small test examples:</tt> </tt>
<a name="L1072"></a><tt class="py-lineno">1072</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sc'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt id="link-153" class="py-name"><a title="pyspark.context.SparkContext" class="py-name" href="#" onclick="return doclink('link-153', 'SparkContext', 'link-151');">SparkContext</a></tt><tt class="py-op">(</tt><tt class="py-string">'local[4]'</tt><tt class="py-op">,</tt> <tt class="py-string">'PythonTest'</tt><tt class="py-op">,</tt> <tt class="py-name">batchSize</tt><tt class="py-op">=</tt><tt class="py-number">2</tt><tt class="py-op">)</tt> </tt>
<a name="L1073"></a><tt class="py-lineno">1073</tt>  <tt class="py-line">    <tt class="py-op">(</tt><tt class="py-name">failure_count</tt><tt class="py-op">,</tt> <tt class="py-name">test_count</tt><tt class="py-op">)</tt> <tt class="py-op">=</tt> <tt class="py-name">doctest</tt><tt class="py-op">.</tt><tt class="py-name">testmod</tt><tt class="py-op">(</tt><tt class="py-name">globs</tt><tt class="py-op">=</tt><tt class="py-name">globs</tt><tt class="py-op">,</tt><tt class="py-name">optionflags</tt><tt class="py-op">=</tt><tt class="py-name">doctest</tt><tt class="py-op">.</tt><tt class="py-name">ELLIPSIS</tt><tt class="py-op">)</tt> </tt>
<a name="L1074"></a><tt class="py-lineno">1074</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sc'</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt id="link-154" class="py-name" targets="Method pyspark.context.SparkContext.stop()=pyspark.context.SparkContext-class.html#stop"><a title="pyspark.context.SparkContext.stop" class="py-name" href="#" onclick="return doclink('link-154', 'stop', 'link-154');">stop</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L1075"></a><tt class="py-lineno">1075</tt>  <tt class="py-line">    <tt class="py-keyword">if</tt> <tt class="py-name">failure_count</tt><tt class="py-op">:</tt> </tt>
<a name="L1076"></a><tt class="py-lineno">1076</tt>  <tt class="py-line">        <tt class="py-name">exit</tt><tt class="py-op">(</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">)</tt> </tt>
</div><a name="L1077"></a><tt class="py-lineno">1077</tt>  <tt class="py-line"> </tt>
<a name="L1078"></a><tt class="py-lineno">1078</tt>  <tt class="py-line"> </tt>
<a name="L1079"></a><tt class="py-lineno">1079</tt>  <tt class="py-line"><tt class="py-keyword">if</tt> <tt class="py-name">__name__</tt> <tt class="py-op">==</tt> <tt class="py-string">"__main__"</tt><tt class="py-op">:</tt> </tt>
<a name="L1080"></a><tt class="py-lineno">1080</tt>  <tt class="py-line">    <tt class="py-name">_test</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L1081"></a><tt class="py-lineno">1081</tt>  <tt class="py-line"> </tt><script type="text/javascript">
<!--
expandto(location.href);
// -->
</script>
</pre>
<br />
<!-- ==================== NAVIGATION BAR ==================== -->
<!-- Page-bottom navigation strip: four local links (Home, Trees, Indices,
     Help) followed by a right-aligned cell linking to the project site. -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
    <!-- Home -->
    <th>&nbsp;&nbsp;&nbsp;<a href="pyspark-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

    <!-- Trees -->
    <th>&nbsp;&nbsp;&nbsp;<a href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

    <!-- Indices -->
    <th>&nbsp;&nbsp;&nbsp;<a href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

    <!-- Help -->
    <th>&nbsp;&nbsp;&nbsp;<a href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

    <!-- Project homepage: takes the remaining width; opens in the top-level frame -->
    <th class="navbar" align="right" width="100%">
      <table border="0" cellpadding="0" cellspacing="0">
        <tr>
          <th class="navbar" align="center"><a class="navbar" target="_top" href="http://spark-project.org">PySpark</a></th>
        </tr>
      </table></th>
  </tr>
</table>
<!-- Footer: Epydoc generation stamp on the left, link to the Epydoc site on the right. -->
<table border="0" cellpadding="0" cellspacing="0" width="100%">
  <tr>
    <td align="left" class="footer">
    Generated by Epydoc 3.0.1 on Sun Mar  2 16:35:00 2014
    </td>
    <td align="right" class="footer">
      <a target="mainFrame" href="http://epydoc.sourceforge.net"
        >http://epydoc.sourceforge.net</a>
    </td>
  </tr>
</table>

<script type="text/javascript">
  <!--
  // Private objects are initially displayed (because if
  // javascript is turned off then we want them to be
  // visible); but by default, we want to hide them.  So hide
  // them unless we have a cookie that says to show them.
  //
  // checkCookie() is not defined in this part of the page; it is
  // presumably provided by a script loaded in the document head
  // (TODO confirm).  The surrounding comment markers are the legacy
  // script-hiding wrapper and are kept as generated.
  checkCookie();
  // -->
</script>
</body>
</html>