summaryrefslogtreecommitdiff
path: root/site/docs/1.0.1/api/python/pyspark.sql-pysrc.html
blob: 8c51878ef3b2a6ba05bf7ecd72a251e076690125 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
<?xml version="1.0" encoding="ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
  <title>pyspark.sql</title>
  <link rel="stylesheet" href="epydoc.css" type="text/css" />
  <script type="text/javascript" src="epydoc.js"></script>
</head>

<body bgcolor="white" text="black" link="blue" vlink="#204080"
      alink="#204080">
<!-- ==================== NAVIGATION BAR ==================== -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
  <!-- Home link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="pyspark-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Tree link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Index link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Help link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Project homepage -->
      <th class="navbar" align="right" width="100%">
        <table border="0" cellpadding="0" cellspacing="0">
          <tr><th class="navbar" align="center"
            ><a class="navbar" target="_top" href="http://spark.apache.org">Spark 1.0.1 Python API Docs</a></th>
          </tr></table></th>
  </tr>
</table>
<table width="100%" cellpadding="0" cellspacing="0">
  <tr valign="top">
    <td width="100%">
      <span class="breadcrumbs">
        <a href="pyspark-module.html">Package&nbsp;pyspark</a> ::
        Module&nbsp;sql
      </span>
    </td>
    <td>
      <table cellpadding="0" cellspacing="0">
        <!-- hide/show private -->
        <tr><td align="right"><span class="options"
            >[<a href="frames.html" target="_top">frames</a
            >]&nbsp;|&nbsp;<a href="pyspark.sql-pysrc.html"
            target="_top">no&nbsp;frames</a>]</span></td></tr>
      </table>
    </td>
  </tr>
</table>
<h1 class="epydoc">Source Code for <a href="pyspark.sql-module.html">Module pyspark.sql</a></h1>
<pre class="py-src">
<a name="L1"></a><tt class="py-lineno">  1</tt>  <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L2"></a><tt class="py-lineno">  2</tt>  <tt class="py-line"><tt class="py-comment"># Licensed to the Apache Software Foundation (ASF) under one or more</tt> </tt>
<a name="L3"></a><tt class="py-lineno">  3</tt>  <tt class="py-line"><tt class="py-comment"># contributor license agreements.  See the NOTICE file distributed with</tt> </tt>
<a name="L4"></a><tt class="py-lineno">  4</tt>  <tt class="py-line"><tt class="py-comment"># this work for additional information regarding copyright ownership.</tt> </tt>
<a name="L5"></a><tt class="py-lineno">  5</tt>  <tt class="py-line"><tt class="py-comment"># The ASF licenses this file to You under the Apache License, Version 2.0</tt> </tt>
<a name="L6"></a><tt class="py-lineno">  6</tt>  <tt class="py-line"><tt class="py-comment"># (the "License"); you may not use this file except in compliance with</tt> </tt>
<a name="L7"></a><tt class="py-lineno">  7</tt>  <tt class="py-line"><tt class="py-comment"># the License.  You may obtain a copy of the License at</tt> </tt>
<a name="L8"></a><tt class="py-lineno">  8</tt>  <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L9"></a><tt class="py-lineno">  9</tt>  <tt class="py-line"><tt class="py-comment">#    http://www.apache.org/licenses/LICENSE-2.0</tt> </tt>
<a name="L10"></a><tt class="py-lineno"> 10</tt>  <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L11"></a><tt class="py-lineno"> 11</tt>  <tt class="py-line"><tt class="py-comment"># Unless required by applicable law or agreed to in writing, software</tt> </tt>
<a name="L12"></a><tt class="py-lineno"> 12</tt>  <tt class="py-line"><tt class="py-comment"># distributed under the License is distributed on an "AS IS" BASIS,</tt> </tt>
<a name="L13"></a><tt class="py-lineno"> 13</tt>  <tt class="py-line"><tt class="py-comment"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</tt> </tt>
<a name="L14"></a><tt class="py-lineno"> 14</tt>  <tt class="py-line"><tt class="py-comment"># See the License for the specific language governing permissions and</tt> </tt>
<a name="L15"></a><tt class="py-lineno"> 15</tt>  <tt class="py-line"><tt class="py-comment"># limitations under the License.</tt> </tt>
<a name="L16"></a><tt class="py-lineno"> 16</tt>  <tt class="py-line"><tt class="py-comment">#</tt> </tt>
<a name="L17"></a><tt class="py-lineno"> 17</tt>  <tt class="py-line"> </tt>
<a name="L18"></a><tt class="py-lineno"> 18</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-0" class="py-name" targets="Package pyspark=pyspark-module.html"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-0', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-1" class="py-name" targets="Module pyspark.rdd=pyspark.rdd-module.html"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-1', 'rdd', 'link-1');">rdd</a></tt> <tt class="py-keyword">import</tt> <tt id="link-2" class="py-name" targets="Class pyspark.rdd.RDD=pyspark.rdd.RDD-class.html"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-2', 'RDD', 'link-2');">RDD</a></tt><tt class="py-op">,</tt> <tt class="py-name">PipelinedRDD</tt> </tt>
<a name="L19"></a><tt class="py-lineno"> 19</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt id="link-3" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-3', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-4" class="py-name" targets="Module pyspark.serializers=pyspark.serializers-module.html"><a title="pyspark.serializers" class="py-name" href="#" onclick="return doclink('link-4', 'serializers', 'link-4');">serializers</a></tt> <tt class="py-keyword">import</tt> <tt class="py-name">BatchedSerializer</tt><tt class="py-op">,</tt> <tt id="link-5" class="py-name" targets="Class pyspark.serializers.PickleSerializer=pyspark.serializers.PickleSerializer-class.html"><a title="pyspark.serializers.PickleSerializer" class="py-name" href="#" onclick="return doclink('link-5', 'PickleSerializer', 'link-5');">PickleSerializer</a></tt> </tt>
<a name="L20"></a><tt class="py-lineno"> 20</tt>  <tt class="py-line"> </tt>
<a name="L21"></a><tt class="py-lineno"> 21</tt>  <tt class="py-line"><tt class="py-keyword">from</tt> <tt class="py-name">py4j</tt><tt class="py-op">.</tt><tt class="py-name">protocol</tt> <tt class="py-keyword">import</tt> <tt class="py-name">Py4JError</tt> </tt>
<a name="L22"></a><tt class="py-lineno"> 22</tt>  <tt class="py-line"> </tt>
<a name="L23"></a><tt class="py-lineno"> 23</tt>  <tt class="py-line"><tt class="py-name">__all__</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-string">"SQLContext"</tt><tt class="py-op">,</tt> <tt class="py-string">"HiveContext"</tt><tt class="py-op">,</tt> <tt class="py-string">"LocalHiveContext"</tt><tt class="py-op">,</tt> <tt class="py-string">"TestHiveContext"</tt><tt class="py-op">,</tt> <tt class="py-string">"SchemaRDD"</tt><tt class="py-op">,</tt> <tt class="py-string">"Row"</tt><tt class="py-op">]</tt> </tt>
<a name="SQLContext"></a><div id="SQLContext-def"><a name="L24"></a><tt class="py-lineno"> 24</tt>  <tt class="py-line"> </tt>
<a name="L25"></a><tt class="py-lineno"> 25</tt>  <tt class="py-line"> </tt>
<a name="L26"></a><tt class="py-lineno"> 26</tt> <a class="py-toggle" href="#" id="SQLContext-toggle" onclick="return toggle('SQLContext');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html">SQLContext</a><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="SQLContext-expanded"><a name="L27"></a><tt class="py-lineno"> 27</tt>  <tt class="py-line">    <tt class="py-docstring">"""Main entry point for SparkSQL functionality.</tt> </tt>
<a name="L28"></a><tt class="py-lineno"> 28</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L29"></a><tt class="py-lineno"> 29</tt>  <tt class="py-line"><tt class="py-docstring">    A SQLContext can be used create L{SchemaRDD}s, register L{SchemaRDD}s as</tt> </tt>
<a name="L30"></a><tt class="py-lineno"> 30</tt>  <tt class="py-line"><tt class="py-docstring">    tables, execute SQL over tables, cache tables, and read parquet files.</tt> </tt>
<a name="L31"></a><tt class="py-lineno"> 31</tt>  <tt class="py-line"><tt class="py-docstring">    """</tt> </tt>
<a name="L32"></a><tt class="py-lineno"> 32</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.__init__"></a><div id="SQLContext.__init__-def"><a name="L33"></a><tt class="py-lineno"> 33</tt> <a class="py-toggle" href="#" id="SQLContext.__init__-toggle" onclick="return toggle('SQLContext.__init__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">sparkContext</tt><tt class="py-op">,</tt> <tt class="py-param">sqlContext</tt> <tt class="py-op">=</tt> <tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.__init__-expanded"><a name="L34"></a><tt class="py-lineno"> 34</tt>  <tt class="py-line">        <tt class="py-docstring">"""Create a new SQLContext.</tt> </tt>
<a name="L35"></a><tt class="py-lineno"> 35</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L36"></a><tt class="py-lineno"> 36</tt>  <tt class="py-line"><tt class="py-docstring">        @param sparkContext: The SparkContext to wrap.</tt> </tt>
<a name="L37"></a><tt class="py-lineno"> 37</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L38"></a><tt class="py-lineno"> 38</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L39"></a><tt class="py-lineno"> 39</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sqlCtx.inferSchema(srdd) # doctest: +IGNORE_EXCEPTION_DETAIL</tt> </tt>
<a name="L40"></a><tt class="py-lineno"> 40</tt>  <tt class="py-line"><tt class="py-docstring">        Traceback (most recent call last):</tt> </tt>
<a name="L41"></a><tt class="py-lineno"> 41</tt>  <tt class="py-line"><tt class="py-docstring">            ...</tt> </tt>
<a name="L42"></a><tt class="py-lineno"> 42</tt>  <tt class="py-line"><tt class="py-docstring">        ValueError:...</tt> </tt>
<a name="L43"></a><tt class="py-lineno"> 43</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L44"></a><tt class="py-lineno"> 44</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; bad_rdd = sc.parallelize([1,2,3])</tt> </tt>
<a name="L45"></a><tt class="py-lineno"> 45</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sqlCtx.inferSchema(bad_rdd) # doctest: +IGNORE_EXCEPTION_DETAIL</tt> </tt>
<a name="L46"></a><tt class="py-lineno"> 46</tt>  <tt class="py-line"><tt class="py-docstring">        Traceback (most recent call last):</tt> </tt>
<a name="L47"></a><tt class="py-lineno"> 47</tt>  <tt class="py-line"><tt class="py-docstring">            ...</tt> </tt>
<a name="L48"></a><tt class="py-lineno"> 48</tt>  <tt class="py-line"><tt class="py-docstring">        ValueError:...</tt> </tt>
<a name="L49"></a><tt class="py-lineno"> 49</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L50"></a><tt class="py-lineno"> 50</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; allTypes = sc.parallelize([{"int" : 1, "string" : "string", "double" : 1.0, "long": 1L,</tt> </tt>
<a name="L51"></a><tt class="py-lineno"> 51</tt>  <tt class="py-line"><tt class="py-docstring">        ... "boolean" : True}])</tt> </tt>
<a name="L52"></a><tt class="py-lineno"> 52</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(allTypes).map(lambda x: (x.int, x.string, x.double, x.long,</tt> </tt>
<a name="L53"></a><tt class="py-lineno"> 53</tt>  <tt class="py-line"><tt class="py-docstring">        ... x.boolean))</tt> </tt>
<a name="L54"></a><tt class="py-lineno"> 54</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.collect()[0]</tt> </tt>
<a name="L55"></a><tt class="py-lineno"> 55</tt>  <tt class="py-line"><tt class="py-docstring">        (1, u'string', 1.0, 1, True)</tt> </tt>
<a name="L56"></a><tt class="py-lineno"> 56</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L57"></a><tt class="py-lineno"> 57</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-6" class="py-name" targets="Variable pyspark.files.SparkFiles._sc=pyspark.files.SparkFiles-class.html#_sc"><a title="pyspark.files.SparkFiles._sc" class="py-name" href="#" onclick="return doclink('link-6', '_sc', 'link-6');">_sc</a></tt> <tt class="py-op">=</tt> <tt class="py-name">sparkContext</tt> </tt>
<a name="L58"></a><tt class="py-lineno"> 58</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jsc</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-7" class="py-name"><a title="pyspark.files.SparkFiles._sc" class="py-name" href="#" onclick="return doclink('link-7', '_sc', 'link-6');">_sc</a></tt><tt class="py-op">.</tt><tt class="py-name">_jsc</tt> </tt>
<a name="L59"></a><tt class="py-lineno"> 59</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-8" class="py-name" targets="Variable pyspark.context.SparkContext._jvm=pyspark.context.SparkContext-class.html#_jvm"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-8', '_jvm', 'link-8');">_jvm</a></tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-9" class="py-name"><a title="pyspark.files.SparkFiles._sc" class="py-name" href="#" onclick="return doclink('link-9', '_sc', 'link-6');">_sc</a></tt><tt class="py-op">.</tt><tt id="link-10" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-10', '_jvm', 'link-8');">_jvm</a></tt> </tt>
<a name="L60"></a><tt class="py-lineno"> 60</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_pythonToJavaMap</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-11" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-11', '_jvm', 'link-8');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">PythonRDD</tt><tt class="py-op">.</tt><tt class="py-name">pythonToJavaMap</tt> </tt>
<a name="L61"></a><tt class="py-lineno"> 61</tt>  <tt class="py-line"> </tt>
<a name="L62"></a><tt class="py-lineno"> 62</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">sqlContext</tt><tt class="py-op">:</tt> </tt>
<a name="L63"></a><tt class="py-lineno"> 63</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_scala_SQLContext</tt> <tt class="py-op">=</tt> <tt class="py-name">sqlContext</tt> </tt>
</div><a name="L64"></a><tt class="py-lineno"> 64</tt>  <tt class="py-line"> </tt>
<a name="L65"></a><tt class="py-lineno"> 65</tt>  <tt class="py-line">    <tt class="py-decorator">@</tt><tt class="py-decorator">property</tt> </tt>
<a name="SQLContext._ssql_ctx"></a><div id="SQLContext._ssql_ctx-def"><a name="L66"></a><tt class="py-lineno"> 66</tt> <a class="py-toggle" href="#" id="SQLContext._ssql_ctx-toggle" onclick="return toggle('SQLContext._ssql_ctx');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#_ssql_ctx">_ssql_ctx</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext._ssql_ctx-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext._ssql_ctx-expanded"><a name="L67"></a><tt class="py-lineno"> 67</tt>  <tt class="py-line">        <tt class="py-docstring">"""Accessor for the JVM SparkSQL context.</tt> </tt>
<a name="L68"></a><tt class="py-lineno"> 68</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L69"></a><tt class="py-lineno"> 69</tt>  <tt class="py-line"><tt class="py-docstring">        Subclasses can override this property to provide their own</tt> </tt>
<a name="L70"></a><tt class="py-lineno"> 70</tt>  <tt class="py-line"><tt class="py-docstring">        JVM Contexts.</tt> </tt>
<a name="L71"></a><tt class="py-lineno"> 71</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L72"></a><tt class="py-lineno"> 72</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">hasattr</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-string">'_scala_SQLContext'</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L73"></a><tt class="py-lineno"> 73</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_scala_SQLContext</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-12" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-12', '_jvm', 'link-8');">_jvm</a></tt><tt class="py-op">.</tt><tt id="link-13" class="py-name" targets="Class pyspark.sql.SQLContext=pyspark.sql.SQLContext-class.html"><a title="pyspark.sql.SQLContext" class="py-name" href="#" onclick="return doclink('link-13', 'SQLContext', 'link-13');">SQLContext</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jsc</tt><tt class="py-op">.</tt><tt class="py-name">sc</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L74"></a><tt class="py-lineno"> 74</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_scala_SQLContext</tt> </tt>
</div><a name="L75"></a><tt class="py-lineno"> 75</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.inferSchema"></a><div id="SQLContext.inferSchema-def"><a name="L76"></a><tt class="py-lineno"> 76</tt> <a class="py-toggle" href="#" id="SQLContext.inferSchema-toggle" onclick="return toggle('SQLContext.inferSchema');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#inferSchema">inferSchema</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">rdd</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.inferSchema-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.inferSchema-expanded"><a name="L77"></a><tt class="py-lineno"> 77</tt>  <tt class="py-line">        <tt class="py-docstring">"""Infer and apply a schema to an RDD of L{dict}s.</tt> </tt>
<a name="L78"></a><tt class="py-lineno"> 78</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L79"></a><tt class="py-lineno"> 79</tt>  <tt class="py-line"><tt class="py-docstring">        We peek at the first row of the RDD to determine the fields names</tt> </tt>
<a name="L80"></a><tt class="py-lineno"> 80</tt>  <tt class="py-line"><tt class="py-docstring">        and types, and then use that to extract all the dictionaries. Nested</tt> </tt>
<a name="L81"></a><tt class="py-lineno"> 81</tt>  <tt class="py-line"><tt class="py-docstring">        collections are supported, which include array, dict, list, set, and</tt> </tt>
<a name="L82"></a><tt class="py-lineno"> 82</tt>  <tt class="py-line"><tt class="py-docstring">        tuple.</tt> </tt>
<a name="L83"></a><tt class="py-lineno"> 83</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L84"></a><tt class="py-lineno"> 84</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L85"></a><tt class="py-lineno"> 85</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.collect() == [{"field1" : 1, "field2" : "row1"}, {"field1" : 2, "field2": "row2"},</tt> </tt>
<a name="L86"></a><tt class="py-lineno"> 86</tt>  <tt class="py-line"><tt class="py-docstring">        ...                    {"field1" : 3, "field2": "row3"}]</tt> </tt>
<a name="L87"></a><tt class="py-lineno"> 87</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L88"></a><tt class="py-lineno"> 88</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L89"></a><tt class="py-lineno"> 89</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; from array import array</tt> </tt>
<a name="L90"></a><tt class="py-lineno"> 90</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(nestedRdd1)</tt> </tt>
<a name="L91"></a><tt class="py-lineno"> 91</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.collect() == [{"f1" : array('i', [1, 2]), "f2" : {"row1" : 1.0}},</tt> </tt>
<a name="L92"></a><tt class="py-lineno"> 92</tt>  <tt class="py-line"><tt class="py-docstring">        ...                    {"f1" : array('i', [2, 3]), "f2" : {"row2" : 2.0}}]</tt> </tt>
<a name="L93"></a><tt class="py-lineno"> 93</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L94"></a><tt class="py-lineno"> 94</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L95"></a><tt class="py-lineno"> 95</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(nestedRdd2)</tt> </tt>
<a name="L96"></a><tt class="py-lineno"> 96</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.collect() == [{"f1" : [[1, 2], [2, 3]], "f2" : set([1, 2]), "f3" : (1, 2)},</tt> </tt>
<a name="L97"></a><tt class="py-lineno"> 97</tt>  <tt class="py-line"><tt class="py-docstring">        ...                    {"f1" : [[2, 3], [3, 4]], "f2" : set([2, 3]), "f3" : (2, 3)}]</tt> </tt>
<a name="L98"></a><tt class="py-lineno"> 98</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L99"></a><tt class="py-lineno"> 99</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L100"></a><tt class="py-lineno">100</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-op">(</tt><tt id="link-14" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-14', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">.</tt><tt class="py-name">__class__</tt> <tt class="py-keyword">is</tt> <tt id="link-15" class="py-name" targets="Class pyspark.sql.SchemaRDD=pyspark.sql.SchemaRDD-class.html"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-15', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L101"></a><tt class="py-lineno">101</tt>  <tt class="py-line">            <tt class="py-keyword">raise</tt> <tt class="py-name">ValueError</tt><tt class="py-op">(</tt><tt class="py-string">"Cannot apply schema to %s"</tt> <tt class="py-op">%</tt> <tt id="link-16" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-16', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">.</tt><tt class="py-name">__name__</tt><tt class="py-op">)</tt> </tt>
<a name="L102"></a><tt class="py-lineno">102</tt>  <tt class="py-line">        <tt class="py-keyword">elif</tt> <tt class="py-keyword">not</tt> <tt class="py-name">isinstance</tt><tt class="py-op">(</tt><tt id="link-17" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-17', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">.</tt><tt id="link-18" class="py-name" targets="Method pyspark.rdd.RDD.first()=pyspark.rdd.RDD-class.html#first"><a title="pyspark.rdd.RDD.first" class="py-name" href="#" onclick="return doclink('link-18', 'first', 'link-18');">first</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">dict</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L103"></a><tt class="py-lineno">103</tt>  <tt class="py-line">            <tt class="py-keyword">raise</tt> <tt class="py-name">ValueError</tt><tt class="py-op">(</tt><tt class="py-string">"Only RDDs with dictionaries can be converted to %s: %s"</tt> <tt class="py-op">%</tt> </tt>
<a name="L104"></a><tt class="py-lineno">104</tt>  <tt class="py-line">                             <tt class="py-op">(</tt><tt id="link-19" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-19', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">.</tt><tt class="py-name">__name__</tt><tt class="py-op">,</tt> <tt id="link-20" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-20', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">.</tt><tt id="link-21" class="py-name"><a title="pyspark.rdd.RDD.first" class="py-name" href="#" onclick="return doclink('link-21', 'first', 'link-18');">first</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L105"></a><tt class="py-lineno">105</tt>  <tt class="py-line"> </tt>
<a name="L106"></a><tt class="py-lineno">106</tt>  <tt class="py-line">        <tt class="py-name">jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_pythonToJavaMap</tt><tt class="py-op">(</tt><tt id="link-22" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-22', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">)</tt> </tt>
<a name="L107"></a><tt class="py-lineno">107</tt>  <tt class="py-line">        <tt class="py-name">srdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-23" class="py-name" targets="Method pyspark.sql.SQLContext.inferSchema()=pyspark.sql.SQLContext-class.html#inferSchema"><a title="pyspark.sql.SQLContext.inferSchema" class="py-name" href="#" onclick="return doclink('link-23', 'inferSchema', 'link-23');">inferSchema</a></tt><tt class="py-op">(</tt><tt class="py-name">jrdd</tt><tt class="py-op">.</tt><tt id="link-24" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-24', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L108"></a><tt class="py-lineno">108</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-25" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-25', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt class="py-name">srdd</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">)</tt> </tt>
</div><a name="L109"></a><tt class="py-lineno">109</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.registerRDDAsTable"></a><div id="SQLContext.registerRDDAsTable-def"><a name="L110"></a><tt class="py-lineno">110</tt> <a class="py-toggle" href="#" id="SQLContext.registerRDDAsTable-toggle" onclick="return toggle('SQLContext.registerRDDAsTable');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#registerRDDAsTable">registerRDDAsTable</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">rdd</tt><tt class="py-op">,</tt> <tt class="py-param">tableName</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.registerRDDAsTable-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.registerRDDAsTable-expanded"><a name="L111"></a><tt class="py-lineno">111</tt>  <tt class="py-line">        <tt class="py-docstring">"""Registers the given RDD as a temporary table in the catalog.</tt> </tt>
<a name="L112"></a><tt class="py-lineno">112</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L113"></a><tt class="py-lineno">113</tt>  <tt class="py-line"><tt class="py-docstring">        Temporary tables exist only during the lifetime of this instance of</tt> </tt>
<a name="L114"></a><tt class="py-lineno">114</tt>  <tt class="py-line"><tt class="py-docstring">        SQLContext.</tt> </tt>
<a name="L115"></a><tt class="py-lineno">115</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L116"></a><tt class="py-lineno">116</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L117"></a><tt class="py-lineno">117</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sqlCtx.registerRDDAsTable(srdd, "table1")</tt> </tt>
<a name="L118"></a><tt class="py-lineno">118</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L119"></a><tt class="py-lineno">119</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-op">(</tt><tt id="link-26" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-26', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">.</tt><tt class="py-name">__class__</tt> <tt class="py-keyword">is</tt> <tt id="link-27" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-27', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L120"></a><tt class="py-lineno">120</tt>  <tt class="py-line">            <tt class="py-name">jschema_rdd</tt> <tt class="py-op">=</tt> <tt id="link-28" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-28', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt> </tt>
<a name="L121"></a><tt class="py-lineno">121</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-29" class="py-name" targets="Method pyspark.sql.SQLContext.registerRDDAsTable()=pyspark.sql.SQLContext-class.html#registerRDDAsTable"><a title="pyspark.sql.SQLContext.registerRDDAsTable" class="py-name" href="#" onclick="return doclink('link-29', 'registerRDDAsTable', 'link-29');">registerRDDAsTable</a></tt><tt class="py-op">(</tt><tt class="py-name">jschema_rdd</tt><tt class="py-op">,</tt> <tt class="py-name">tableName</tt><tt class="py-op">)</tt> </tt>
<a name="L122"></a><tt class="py-lineno">122</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L123"></a><tt class="py-lineno">123</tt>  <tt class="py-line">            <tt class="py-keyword">raise</tt> <tt class="py-name">ValueError</tt><tt class="py-op">(</tt><tt class="py-string">"Can only register SchemaRDD as table"</tt><tt class="py-op">)</tt> </tt>
</div><a name="L124"></a><tt class="py-lineno">124</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.parquetFile"></a><div id="SQLContext.parquetFile-def"><a name="L125"></a><tt class="py-lineno">125</tt> <a class="py-toggle" href="#" id="SQLContext.parquetFile-toggle" onclick="return toggle('SQLContext.parquetFile');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#parquetFile">parquetFile</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">path</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.parquetFile-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.parquetFile-expanded"><a name="L126"></a><tt class="py-lineno">126</tt>  <tt class="py-line">        <tt class="py-docstring">"""Loads a Parquet file, returning the result as a L{SchemaRDD}.</tt> </tt>
<a name="L127"></a><tt class="py-lineno">127</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L128"></a><tt class="py-lineno">128</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; import tempfile, shutil</tt> </tt>
<a name="L129"></a><tt class="py-lineno">129</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; parquetFile = tempfile.mkdtemp()</tt> </tt>
<a name="L130"></a><tt class="py-lineno">130</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; shutil.rmtree(parquetFile)</tt> </tt>
<a name="L131"></a><tt class="py-lineno">131</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L132"></a><tt class="py-lineno">132</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.saveAsParquetFile(parquetFile)</tt> </tt>
<a name="L133"></a><tt class="py-lineno">133</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2 = sqlCtx.parquetFile(parquetFile)</tt> </tt>
<a name="L134"></a><tt class="py-lineno">134</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(srdd.collect()) == sorted(srdd2.collect())</tt> </tt>
<a name="L135"></a><tt class="py-lineno">135</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L136"></a><tt class="py-lineno">136</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L137"></a><tt class="py-lineno">137</tt>  <tt class="py-line">        <tt class="py-name">jschema_rdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-30" class="py-name" targets="Method pyspark.sql.SQLContext.parquetFile()=pyspark.sql.SQLContext-class.html#parquetFile"><a title="pyspark.sql.SQLContext.parquetFile" class="py-name" href="#" onclick="return doclink('link-30', 'parquetFile', 'link-30');">parquetFile</a></tt><tt class="py-op">(</tt><tt class="py-name">path</tt><tt class="py-op">)</tt> </tt>
<a name="L138"></a><tt class="py-lineno">138</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-31" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-31', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt class="py-name">jschema_rdd</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">)</tt> </tt>
</div><a name="L139"></a><tt class="py-lineno">139</tt>  <tt class="py-line"> </tt>
<a name="L140"></a><tt class="py-lineno">140</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.jsonFile"></a><div id="SQLContext.jsonFile-def"><a name="L141"></a><tt class="py-lineno">141</tt> <a class="py-toggle" href="#" id="SQLContext.jsonFile-toggle" onclick="return toggle('SQLContext.jsonFile');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#jsonFile">jsonFile</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">path</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.jsonFile-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.jsonFile-expanded"><a name="L142"></a><tt class="py-lineno">142</tt>  <tt class="py-line">        <tt class="py-docstring">"""Loads a text file storing one JSON object per line,</tt> </tt>
<a name="L143"></a><tt class="py-lineno">143</tt>  <tt class="py-line"><tt class="py-docstring">           returning the result as a L{SchemaRDD}.</tt> </tt>
<a name="L144"></a><tt class="py-lineno">144</tt>  <tt class="py-line"><tt class="py-docstring">           It goes through the entire dataset once to determine the schema.</tt> </tt>
<a name="L145"></a><tt class="py-lineno">145</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L146"></a><tt class="py-lineno">146</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; import tempfile, shutil</tt> </tt>
<a name="L147"></a><tt class="py-lineno">147</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; jsonFile = tempfile.mkdtemp()</tt> </tt>
<a name="L148"></a><tt class="py-lineno">148</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; shutil.rmtree(jsonFile)</tt> </tt>
<a name="L149"></a><tt class="py-lineno">149</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; ofn = open(jsonFile, 'w')</tt> </tt>
<a name="L150"></a><tt class="py-lineno">150</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; for json in jsonStrings:</tt> </tt>
<a name="L151"></a><tt class="py-lineno">151</tt>  <tt class="py-line"><tt class="py-docstring">        ...   print&gt;&gt;ofn, json</tt> </tt>
<a name="L152"></a><tt class="py-lineno">152</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; ofn.close()</tt> </tt>
<a name="L153"></a><tt class="py-lineno">153</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.jsonFile(jsonFile)</tt> </tt>
<a name="L154"></a><tt class="py-lineno">154</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sqlCtx.registerRDDAsTable(srdd, "table1")</tt> </tt>
<a name="L155"></a><tt class="py-lineno">155</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1")</tt> </tt>
<a name="L156"></a><tt class="py-lineno">156</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}},</tt> </tt>
<a name="L157"></a><tt class="py-lineno">157</tt>  <tt class="py-line"><tt class="py-docstring">        ...                     {"f1": 2, "f2": "row2", "f3":{"field4":22}},</tt> </tt>
<a name="L158"></a><tt class="py-lineno">158</tt>  <tt class="py-line"><tt class="py-docstring">        ...                     {"f1": 3, "f2": "row3", "f3":{"field4":33}}]</tt> </tt>
<a name="L159"></a><tt class="py-lineno">159</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L160"></a><tt class="py-lineno">160</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L161"></a><tt class="py-lineno">161</tt>  <tt class="py-line">        <tt class="py-name">jschema_rdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-32" class="py-name" targets="Method pyspark.sql.SQLContext.jsonFile()=pyspark.sql.SQLContext-class.html#jsonFile"><a title="pyspark.sql.SQLContext.jsonFile" class="py-name" href="#" onclick="return doclink('link-32', 'jsonFile', 'link-32');">jsonFile</a></tt><tt class="py-op">(</tt><tt class="py-name">path</tt><tt class="py-op">)</tt> </tt>
<a name="L162"></a><tt class="py-lineno">162</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-33" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-33', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt class="py-name">jschema_rdd</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">)</tt> </tt>
</div><a name="L163"></a><tt class="py-lineno">163</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.jsonRDD"></a><div id="SQLContext.jsonRDD-def"><a name="L164"></a><tt class="py-lineno">164</tt> <a class="py-toggle" href="#" id="SQLContext.jsonRDD-toggle" onclick="return toggle('SQLContext.jsonRDD');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#jsonRDD">jsonRDD</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">rdd</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.jsonRDD-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.jsonRDD-expanded"><a name="L165"></a><tt class="py-lineno">165</tt>  <tt class="py-line">        <tt class="py-docstring">"""Loads an RDD storing one JSON object per string, returning the result as a L{SchemaRDD}.</tt> </tt>
<a name="L166"></a><tt class="py-lineno">166</tt>  <tt class="py-line"><tt class="py-docstring">           It goes through the entire dataset once to determine the schema.</tt> </tt>
<a name="L167"></a><tt class="py-lineno">167</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L168"></a><tt class="py-lineno">168</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.jsonRDD(json)</tt> </tt>
<a name="L169"></a><tt class="py-lineno">169</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sqlCtx.registerRDDAsTable(srdd, "table1")</tt> </tt>
<a name="L170"></a><tt class="py-lineno">170</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1")</tt> </tt>
<a name="L171"></a><tt class="py-lineno">171</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}},</tt> </tt>
<a name="L172"></a><tt class="py-lineno">172</tt>  <tt class="py-line"><tt class="py-docstring">        ...                     {"f1": 2, "f2": "row2", "f3":{"field4":22}},</tt> </tt>
<a name="L173"></a><tt class="py-lineno">173</tt>  <tt class="py-line"><tt class="py-docstring">        ...                     {"f1": 3, "f2": "row3", "f3":{"field4":33}}]</tt> </tt>
<a name="L174"></a><tt class="py-lineno">174</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L175"></a><tt class="py-lineno">175</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L176"></a><tt class="py-lineno">176</tt>  <tt class="py-line">        <tt class="py-keyword">def</tt> <tt class="py-def-name">func</tt><tt class="py-op">(</tt><tt class="py-param">split</tt><tt class="py-op">,</tt> <tt class="py-param">iterator</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L177"></a><tt class="py-lineno">177</tt>  <tt class="py-line">            <tt class="py-keyword">for</tt> <tt class="py-name">x</tt> <tt class="py-keyword">in</tt> <tt class="py-name">iterator</tt><tt class="py-op">:</tt> </tt>
<a name="L178"></a><tt class="py-lineno">178</tt>  <tt class="py-line">                <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">isinstance</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">,</tt> <tt class="py-name">basestring</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L179"></a><tt class="py-lineno">179</tt>  <tt class="py-line">                    <tt class="py-name">x</tt> <tt class="py-op">=</tt> <tt class="py-name">unicode</tt><tt class="py-op">(</tt><tt class="py-name">x</tt><tt class="py-op">)</tt> </tt>
<a name="L180"></a><tt class="py-lineno">180</tt>  <tt class="py-line">                <tt class="py-keyword">yield</tt> <tt class="py-name">x</tt><tt class="py-op">.</tt><tt class="py-name">encode</tt><tt class="py-op">(</tt><tt class="py-string">"utf-8"</tt><tt class="py-op">)</tt> </tt>
</div><a name="L181"></a><tt class="py-lineno">181</tt>  <tt class="py-line">        <tt class="py-name">keyed</tt> <tt class="py-op">=</tt> <tt class="py-name">PipelinedRDD</tt><tt class="py-op">(</tt><tt id="link-34" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-34', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">,</tt> <tt class="py-name">func</tt><tt class="py-op">)</tt> </tt>
<a name="L182"></a><tt class="py-lineno">182</tt>  <tt class="py-line">        <tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_bypass_serializer</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L183"></a><tt class="py-lineno">183</tt>  <tt class="py-line">        <tt class="py-name">jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">keyed</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-35" class="py-name" targets="Method pyspark.rdd.RDD.map()=pyspark.rdd.RDD-class.html#map"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-35', 'map', 'link-35');">map</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-36" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-36', '_jvm', 'link-8');">_jvm</a></tt><tt class="py-op">.</tt><tt class="py-name">BytesToString</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L184"></a><tt class="py-lineno">184</tt>  <tt class="py-line">        <tt class="py-name">jschema_rdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-37" class="py-name" targets="Method pyspark.sql.SQLContext.jsonRDD()=pyspark.sql.SQLContext-class.html#jsonRDD"><a title="pyspark.sql.SQLContext.jsonRDD" class="py-name" href="#" onclick="return doclink('link-37', 'jsonRDD', 'link-37');">jsonRDD</a></tt><tt class="py-op">(</tt><tt class="py-name">jrdd</tt><tt class="py-op">.</tt><tt id="link-38" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-38', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
<a name="L185"></a><tt class="py-lineno">185</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-39" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-39', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt class="py-name">jschema_rdd</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">)</tt> </tt>
</div><a name="L186"></a><tt class="py-lineno">186</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.sql"></a><div id="SQLContext.sql-def"><a name="L187"></a><tt class="py-lineno">187</tt> <a class="py-toggle" href="#" id="SQLContext.sql-toggle" onclick="return toggle('SQLContext.sql');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#sql">sql</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">sqlQuery</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.sql-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.sql-expanded"><a name="L188"></a><tt class="py-lineno">188</tt>  <tt class="py-line">        <tt class="py-docstring">"""Return a L{SchemaRDD} representing the result of the given query.</tt> </tt>
<a name="L189"></a><tt class="py-lineno">189</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L190"></a><tt class="py-lineno">190</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L191"></a><tt class="py-lineno">191</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sqlCtx.registerRDDAsTable(srdd, "table1")</tt> </tt>
<a name="L192"></a><tt class="py-lineno">192</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1")</tt> </tt>
<a name="L193"></a><tt class="py-lineno">193</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2.collect() == [{"f1" : 1, "f2" : "row1"}, {"f1" : 2, "f2": "row2"},</tt> </tt>
<a name="L194"></a><tt class="py-lineno">194</tt>  <tt class="py-line"><tt class="py-docstring">        ...                     {"f1" : 3, "f2": "row3"}]</tt> </tt>
<a name="L195"></a><tt class="py-lineno">195</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L196"></a><tt class="py-lineno">196</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L197"></a><tt class="py-lineno">197</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-40" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-40', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-41" class="py-name" targets="Module pyspark.sql=pyspark.sql-module.html,Method pyspark.sql.SQLContext.sql()=pyspark.sql.SQLContext-class.html#sql"><a title="pyspark.sql
pyspark.sql.SQLContext.sql" class="py-name" href="#" onclick="return doclink('link-41', 'sql', 'link-41');">sql</a></tt><tt class="py-op">(</tt><tt class="py-name">sqlQuery</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">)</tt> </tt>
</div><a name="L198"></a><tt class="py-lineno">198</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.table"></a><div id="SQLContext.table-def"><a name="L199"></a><tt class="py-lineno">199</tt> <a class="py-toggle" href="#" id="SQLContext.table-toggle" onclick="return toggle('SQLContext.table');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#table">table</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">tableName</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.table-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.table-expanded"><a name="L200"></a><tt class="py-lineno">200</tt>  <tt class="py-line">        <tt class="py-docstring">"""Returns the specified table as a L{SchemaRDD}.</tt> </tt>
<a name="L201"></a><tt class="py-lineno">201</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L202"></a><tt class="py-lineno">202</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L203"></a><tt class="py-lineno">203</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sqlCtx.registerRDDAsTable(srdd, "table1")</tt> </tt>
<a name="L204"></a><tt class="py-lineno">204</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2 = sqlCtx.table("table1")</tt> </tt>
<a name="L205"></a><tt class="py-lineno">205</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(srdd.collect()) == sorted(srdd2.collect())</tt> </tt>
<a name="L206"></a><tt class="py-lineno">206</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L207"></a><tt class="py-lineno">207</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L208"></a><tt class="py-lineno">208</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-42" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-42', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-43" class="py-name" targets="Method pyspark.sql.SQLContext.table()=pyspark.sql.SQLContext-class.html#table"><a title="pyspark.sql.SQLContext.table" class="py-name" href="#" onclick="return doclink('link-43', 'table', 'link-43');">table</a></tt><tt class="py-op">(</tt><tt class="py-name">tableName</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">)</tt> </tt>
</div><a name="L209"></a><tt class="py-lineno">209</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.cacheTable"></a><div id="SQLContext.cacheTable-def"><a name="L210"></a><tt class="py-lineno">210</tt> <a class="py-toggle" href="#" id="SQLContext.cacheTable-toggle" onclick="return toggle('SQLContext.cacheTable');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#cacheTable">cacheTable</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">tableName</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.cacheTable-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.cacheTable-expanded"><a name="L211"></a><tt class="py-lineno">211</tt>  <tt class="py-line">        <tt class="py-docstring">"""Caches the specified table in-memory."""</tt> </tt>
<a name="L212"></a><tt class="py-lineno">212</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-44" class="py-name" targets="Method pyspark.sql.SQLContext.cacheTable()=pyspark.sql.SQLContext-class.html#cacheTable"><a title="pyspark.sql.SQLContext.cacheTable" class="py-name" href="#" onclick="return doclink('link-44', 'cacheTable', 'link-44');">cacheTable</a></tt><tt class="py-op">(</tt><tt class="py-name">tableName</tt><tt class="py-op">)</tt> </tt>
</div><a name="L213"></a><tt class="py-lineno">213</tt>  <tt class="py-line"> </tt>
<a name="SQLContext.uncacheTable"></a><div id="SQLContext.uncacheTable-def"><a name="L214"></a><tt class="py-lineno">214</tt> <a class="py-toggle" href="#" id="SQLContext.uncacheTable-toggle" onclick="return toggle('SQLContext.uncacheTable');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SQLContext-class.html#uncacheTable">uncacheTable</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">tableName</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SQLContext.uncacheTable-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SQLContext.uncacheTable-expanded"><a name="L215"></a><tt class="py-lineno">215</tt>  <tt class="py-line">        <tt class="py-docstring">"""Removes the specified table from the in-memory cache."""</tt> </tt>
<a name="L216"></a><tt class="py-lineno">216</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-45" class="py-name" targets="Method pyspark.sql.SQLContext.uncacheTable()=pyspark.sql.SQLContext-class.html#uncacheTable"><a title="pyspark.sql.SQLContext.uncacheTable" class="py-name" href="#" onclick="return doclink('link-45', 'uncacheTable', 'link-45');">uncacheTable</a></tt><tt class="py-op">(</tt><tt class="py-name">tableName</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L217"></a><tt class="py-lineno">217</tt>  <tt class="py-line"> </tt>
<a name="HiveContext"></a><div id="HiveContext-def"><a name="L218"></a><tt class="py-lineno">218</tt>  <tt class="py-line"> </tt>
<a name="L219"></a><tt class="py-lineno">219</tt> <a class="py-toggle" href="#" id="HiveContext-toggle" onclick="return toggle('HiveContext');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.sql.HiveContext-class.html">HiveContext</a><tt class="py-op">(</tt><tt class="py-base-class">SQLContext</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="HiveContext-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="HiveContext-expanded"><a name="L220"></a><tt class="py-lineno">220</tt>  <tt class="py-line">    <tt class="py-docstring">"""A variant of Spark SQL that integrates with data stored in Hive.</tt> </tt>
<a name="L221"></a><tt class="py-lineno">221</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L222"></a><tt class="py-lineno">222</tt>  <tt class="py-line"><tt class="py-docstring">    Configuration for Hive is read from hive-site.xml on the classpath.</tt> </tt>
<a name="L223"></a><tt class="py-lineno">223</tt>  <tt class="py-line"><tt class="py-docstring">    It supports running both SQL and HiveQL commands.</tt> </tt>
<a name="L224"></a><tt class="py-lineno">224</tt>  <tt class="py-line"><tt class="py-docstring">    """</tt> </tt>
<a name="L225"></a><tt class="py-lineno">225</tt>  <tt class="py-line"> </tt>
<a name="L226"></a><tt class="py-lineno">226</tt>  <tt class="py-line">    <tt class="py-decorator">@</tt><tt class="py-decorator">property</tt> </tt>
<a name="HiveContext._ssql_ctx"></a><div id="HiveContext._ssql_ctx-def"><a name="L227"></a><tt class="py-lineno">227</tt> <a class="py-toggle" href="#" id="HiveContext._ssql_ctx-toggle" onclick="return toggle('HiveContext._ssql_ctx');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.HiveContext-class.html#_ssql_ctx">_ssql_ctx</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="HiveContext._ssql_ctx-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="HiveContext._ssql_ctx-expanded"><a name="L228"></a><tt class="py-lineno">228</tt>  <tt class="py-line">        <tt class="py-keyword">try</tt><tt class="py-op">:</tt> </tt>
<a name="L229"></a><tt class="py-lineno">229</tt>  <tt class="py-line">            <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">hasattr</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-string">'_scala_HiveContext'</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L230"></a><tt class="py-lineno">230</tt>  <tt class="py-line">                <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_scala_HiveContext</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_get_hive_ctx</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L231"></a><tt class="py-lineno">231</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_scala_HiveContext</tt> </tt>
<a name="L232"></a><tt class="py-lineno">232</tt>  <tt class="py-line">        <tt class="py-keyword">except</tt> <tt class="py-name">Py4JError</tt> <tt class="py-keyword">as</tt> <tt class="py-name">e</tt><tt class="py-op">:</tt> </tt>
<a name="L233"></a><tt class="py-lineno">233</tt>  <tt class="py-line">            <tt class="py-keyword">raise</tt> <tt class="py-name">Exception</tt><tt class="py-op">(</tt><tt class="py-string">"You must build Spark with Hive. Export 'SPARK_HIVE=true' and run "</tt> \ </tt>
<a name="L234"></a><tt class="py-lineno">234</tt>  <tt class="py-line">                            <tt class="py-string">"sbt/sbt assembly"</tt> <tt class="py-op">,</tt> <tt class="py-name">e</tt><tt class="py-op">)</tt> </tt>
</div><a name="L235"></a><tt class="py-lineno">235</tt>  <tt class="py-line"> </tt>
<a name="HiveContext._get_hive_ctx"></a><div id="HiveContext._get_hive_ctx-def"><a name="L236"></a><tt class="py-lineno">236</tt> <a class="py-toggle" href="#" id="HiveContext._get_hive_ctx-toggle" onclick="return toggle('HiveContext._get_hive_ctx');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.HiveContext-class.html#_get_hive_ctx">_get_hive_ctx</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="HiveContext._get_hive_ctx-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="HiveContext._get_hive_ctx-expanded"><a name="L237"></a><tt class="py-lineno">237</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-46" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-46', '_jvm', 'link-8');">_jvm</a></tt><tt class="py-op">.</tt><tt id="link-47" class="py-name" targets="Class pyspark.sql.HiveContext=pyspark.sql.HiveContext-class.html"><a title="pyspark.sql.HiveContext" class="py-name" href="#" onclick="return doclink('link-47', 'HiveContext', 'link-47');">HiveContext</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jsc</tt><tt class="py-op">.</tt><tt class="py-name">sc</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div><a name="L238"></a><tt class="py-lineno">238</tt>  <tt class="py-line"> </tt>
<a name="HiveContext.hiveql"></a><div id="HiveContext.hiveql-def"><a name="L239"></a><tt class="py-lineno">239</tt> <a class="py-toggle" href="#" id="HiveContext.hiveql-toggle" onclick="return toggle('HiveContext.hiveql');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.HiveContext-class.html#hiveql">hiveql</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">hqlQuery</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="HiveContext.hiveql-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="HiveContext.hiveql-expanded"><a name="L240"></a><tt class="py-lineno">240</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L241"></a><tt class="py-lineno">241</tt>  <tt class="py-line"><tt class="py-docstring">        Runs a query expressed in HiveQL, returning the result as a L{SchemaRDD}.</tt> </tt>
<a name="L242"></a><tt class="py-lineno">242</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L243"></a><tt class="py-lineno">243</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-48" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-48', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_ssql_ctx</tt><tt class="py-op">.</tt><tt id="link-49" class="py-name" targets="Method pyspark.sql.HiveContext.hiveql()=pyspark.sql.HiveContext-class.html#hiveql"><a title="pyspark.sql.HiveContext.hiveql" class="py-name" href="#" onclick="return doclink('link-49', 'hiveql', 'link-49');">hiveql</a></tt><tt class="py-op">(</tt><tt class="py-name">hqlQuery</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">)</tt> </tt>
</div><a name="L244"></a><tt class="py-lineno">244</tt>  <tt class="py-line"> </tt>
<a name="HiveContext.hql"></a><div id="HiveContext.hql-def"><a name="L245"></a><tt class="py-lineno">245</tt> <a class="py-toggle" href="#" id="HiveContext.hql-toggle" onclick="return toggle('HiveContext.hql');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.HiveContext-class.html#hql">hql</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">hqlQuery</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="HiveContext.hql-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="HiveContext.hql-expanded"><a name="L246"></a><tt class="py-lineno">246</tt>  <tt class="py-line">        <tt class="py-docstring">"""</tt> </tt>
<a name="L247"></a><tt class="py-lineno">247</tt>  <tt class="py-line"><tt class="py-docstring">        Runs a query expressed in HiveQL, returning the result as a L{SchemaRDD}.</tt> </tt>
<a name="L248"></a><tt class="py-lineno">248</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L249"></a><tt class="py-lineno">249</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-50" class="py-name"><a title="pyspark.sql.HiveContext.hiveql" class="py-name" href="#" onclick="return doclink('link-50', 'hiveql', 'link-49');">hiveql</a></tt><tt class="py-op">(</tt><tt class="py-name">hqlQuery</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L250"></a><tt class="py-lineno">250</tt>  <tt class="py-line"> </tt>
<a name="LocalHiveContext"></a><div id="LocalHiveContext-def"><a name="L251"></a><tt class="py-lineno">251</tt>  <tt class="py-line"> </tt>
<a name="L252"></a><tt class="py-lineno">252</tt> <a class="py-toggle" href="#" id="LocalHiveContext-toggle" onclick="return toggle('LocalHiveContext');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.sql.LocalHiveContext-class.html">LocalHiveContext</a><tt class="py-op">(</tt><tt class="py-base-class">HiveContext</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="LocalHiveContext-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="LocalHiveContext-expanded"><a name="L253"></a><tt class="py-lineno">253</tt>  <tt class="py-line">    <tt class="py-docstring">"""Starts up an instance of hive where metadata is stored locally.</tt> </tt>
<a name="L254"></a><tt class="py-lineno">254</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L255"></a><tt class="py-lineno">255</tt>  <tt class="py-line"><tt class="py-docstring">    An in-process metadata data is created with data stored in ./metadata.</tt> </tt>
<a name="L256"></a><tt class="py-lineno">256</tt>  <tt class="py-line"><tt class="py-docstring">    Warehouse data is stored in in ./warehouse.</tt> </tt>
<a name="L257"></a><tt class="py-lineno">257</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L258"></a><tt class="py-lineno">258</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; import os</tt> </tt>
<a name="L259"></a><tt class="py-lineno">259</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; hiveCtx = LocalHiveContext(sc)</tt> </tt>
<a name="L260"></a><tt class="py-lineno">260</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; try:</tt> </tt>
<a name="L261"></a><tt class="py-lineno">261</tt>  <tt class="py-line"><tt class="py-docstring">    ...     supress = hiveCtx.hql("DROP TABLE src")</tt> </tt>
<a name="L262"></a><tt class="py-lineno">262</tt>  <tt class="py-line"><tt class="py-docstring">    ... except Exception:</tt> </tt>
<a name="L263"></a><tt class="py-lineno">263</tt>  <tt class="py-line"><tt class="py-docstring">    ...     pass</tt> </tt>
<a name="L264"></a><tt class="py-lineno">264</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; kv1 = os.path.join(os.environ["SPARK_HOME"], 'examples/src/main/resources/kv1.txt')</tt> </tt>
<a name="L265"></a><tt class="py-lineno">265</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; supress = hiveCtx.hql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")</tt> </tt>
<a name="L266"></a><tt class="py-lineno">266</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; supress = hiveCtx.hql("LOAD DATA LOCAL INPATH '%s' INTO TABLE src" % kv1)</tt> </tt>
<a name="L267"></a><tt class="py-lineno">267</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; results = hiveCtx.hql("FROM src SELECT value").map(lambda r: int(r.value.split('_')[1]))</tt> </tt>
<a name="L268"></a><tt class="py-lineno">268</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; num = results.count()</tt> </tt>
<a name="L269"></a><tt class="py-lineno">269</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; reduce_sum = results.reduce(lambda x, y: x + y)</tt> </tt>
<a name="L270"></a><tt class="py-lineno">270</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; num</tt> </tt>
<a name="L271"></a><tt class="py-lineno">271</tt>  <tt class="py-line"><tt class="py-docstring">    500</tt> </tt>
<a name="L272"></a><tt class="py-lineno">272</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; reduce_sum</tt> </tt>
<a name="L273"></a><tt class="py-lineno">273</tt>  <tt class="py-line"><tt class="py-docstring">    130091</tt> </tt>
<a name="L274"></a><tt class="py-lineno">274</tt>  <tt class="py-line"><tt class="py-docstring">    """</tt> </tt>
<a name="L275"></a><tt class="py-lineno">275</tt>  <tt class="py-line"> </tt>
<a name="LocalHiveContext._get_hive_ctx"></a><div id="LocalHiveContext._get_hive_ctx-def"><a name="L276"></a><tt class="py-lineno">276</tt> <a class="py-toggle" href="#" id="LocalHiveContext._get_hive_ctx-toggle" onclick="return toggle('LocalHiveContext._get_hive_ctx');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.LocalHiveContext-class.html#_get_hive_ctx">_get_hive_ctx</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="LocalHiveContext._get_hive_ctx-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="LocalHiveContext._get_hive_ctx-expanded"><a name="L277"></a><tt class="py-lineno">277</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-51" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-51', '_jvm', 'link-8');">_jvm</a></tt><tt class="py-op">.</tt><tt id="link-52" class="py-name" targets="Class pyspark.sql.LocalHiveContext=pyspark.sql.LocalHiveContext-class.html"><a title="pyspark.sql.LocalHiveContext" class="py-name" href="#" onclick="return doclink('link-52', 'LocalHiveContext', 'link-52');">LocalHiveContext</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jsc</tt><tt class="py-op">.</tt><tt class="py-name">sc</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L278"></a><tt class="py-lineno">278</tt>  <tt class="py-line"> </tt>
<a name="TestHiveContext"></a><div id="TestHiveContext-def"><a name="L279"></a><tt class="py-lineno">279</tt>  <tt class="py-line"> </tt>
<a name="L280"></a><tt class="py-lineno">280</tt> <a class="py-toggle" href="#" id="TestHiveContext-toggle" onclick="return toggle('TestHiveContext');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.sql.TestHiveContext-class.html">TestHiveContext</a><tt class="py-op">(</tt><tt class="py-base-class">HiveContext</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="TestHiveContext-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="TestHiveContext-expanded"><a name="L281"></a><tt class="py-lineno">281</tt>  <tt class="py-line"> </tt>
<a name="TestHiveContext._get_hive_ctx"></a><div id="TestHiveContext._get_hive_ctx-def"><a name="L282"></a><tt class="py-lineno">282</tt> <a class="py-toggle" href="#" id="TestHiveContext._get_hive_ctx-toggle" onclick="return toggle('TestHiveContext._get_hive_ctx');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.TestHiveContext-class.html#_get_hive_ctx">_get_hive_ctx</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="TestHiveContext._get_hive_ctx-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="TestHiveContext._get_hive_ctx-expanded"><a name="L283"></a><tt class="py-lineno">283</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-53" class="py-name"><a title="pyspark.context.SparkContext._jvm" class="py-name" href="#" onclick="return doclink('link-53', '_jvm', 'link-8');">_jvm</a></tt><tt class="py-op">.</tt><tt id="link-54" class="py-name" targets="Class pyspark.sql.TestHiveContext=pyspark.sql.TestHiveContext-class.html"><a title="pyspark.sql.TestHiveContext" class="py-name" href="#" onclick="return doclink('link-54', 'TestHiveContext', 'link-54');">TestHiveContext</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jsc</tt><tt class="py-op">.</tt><tt class="py-name">sc</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L284"></a><tt class="py-lineno">284</tt>  <tt class="py-line"> </tt>
<a name="Row"></a><div id="Row-def"><a name="L285"></a><tt class="py-lineno">285</tt>  <tt class="py-line"> </tt>
<a name="L286"></a><tt class="py-lineno">286</tt>  <tt class="py-line"><tt class="py-comment"># TODO: Investigate if it is more efficient to use a namedtuple. One problem is that named tuples</tt> </tt>
<a name="L287"></a><tt class="py-lineno">287</tt>  <tt class="py-line"><tt class="py-comment"># are custom classes that must be generated per Schema.</tt> </tt>
<a name="L288"></a><tt class="py-lineno">288</tt> <a class="py-toggle" href="#" id="Row-toggle" onclick="return toggle('Row');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.sql.Row-class.html">Row</a><tt class="py-op">(</tt><tt class="py-base-class">dict</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="Row-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="Row-expanded"><a name="L289"></a><tt class="py-lineno">289</tt>  <tt class="py-line">    <tt class="py-docstring">"""A row in L{SchemaRDD}.</tt> </tt>
<a name="L290"></a><tt class="py-lineno">290</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L291"></a><tt class="py-lineno">291</tt>  <tt class="py-line"><tt class="py-docstring">    An extended L{dict} that takes a L{dict} in its constructor, and</tt> </tt>
<a name="L292"></a><tt class="py-lineno">292</tt>  <tt class="py-line"><tt class="py-docstring">    exposes those items as fields.</tt> </tt>
<a name="L293"></a><tt class="py-lineno">293</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L294"></a><tt class="py-lineno">294</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; r = Row({"hello" : "world", "foo" : "bar"})</tt> </tt>
<a name="L295"></a><tt class="py-lineno">295</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; r.hello</tt> </tt>
<a name="L296"></a><tt class="py-lineno">296</tt>  <tt class="py-line"><tt class="py-docstring">    'world'</tt> </tt>
<a name="L297"></a><tt class="py-lineno">297</tt>  <tt class="py-line"><tt class="py-docstring">    &gt;&gt;&gt; r.foo</tt> </tt>
<a name="L298"></a><tt class="py-lineno">298</tt>  <tt class="py-line"><tt class="py-docstring">    'bar'</tt> </tt>
<a name="L299"></a><tt class="py-lineno">299</tt>  <tt class="py-line"><tt class="py-docstring">    """</tt> </tt>
<a name="L300"></a><tt class="py-lineno">300</tt>  <tt class="py-line"> </tt>
<a name="Row.__init__"></a><div id="Row.__init__-def"><a name="L301"></a><tt class="py-lineno">301</tt> <a class="py-toggle" href="#" id="Row.__init__-toggle" onclick="return toggle('Row.__init__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.Row-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">d</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="Row.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="Row.__init__-expanded"><a name="L302"></a><tt class="py-lineno">302</tt>  <tt class="py-line">        <tt class="py-name">d</tt><tt class="py-op">.</tt><tt class="py-name">update</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">__dict__</tt><tt class="py-op">)</tt> </tt>
<a name="L303"></a><tt class="py-lineno">303</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">__dict__</tt> <tt class="py-op">=</tt> <tt class="py-name">d</tt> </tt>
<a name="L304"></a><tt class="py-lineno">304</tt>  <tt class="py-line">        <tt class="py-name">dict</tt><tt class="py-op">.</tt><tt id="link-55" class="py-name" targets="Method pyspark.accumulators.Accumulator.__init__()=pyspark.accumulators.Accumulator-class.html#__init__,Method pyspark.accumulators.AddingAccumulatorParam.__init__()=pyspark.accumulators.AddingAccumulatorParam-class.html#__init__,Method pyspark.broadcast.Broadcast.__init__()=pyspark.broadcast.Broadcast-class.html#__init__,Method pyspark.conf.SparkConf.__init__()=pyspark.conf.SparkConf-class.html#__init__,Method pyspark.context.SparkContext.__init__()=pyspark.context.SparkContext-class.html#__init__,Method pyspark.files.SparkFiles.__init__()=pyspark.files.SparkFiles-class.html#__init__,Method pyspark.mllib.classification.NaiveBayesModel.__init__()=pyspark.mllib.classification.NaiveBayesModel-class.html#__init__,Method pyspark.mllib.clustering.KMeansModel.__init__()=pyspark.mllib.clustering.KMeansModel-class.html#__init__,Method pyspark.mllib.linalg.SparseVector.__init__()=pyspark.mllib.linalg.SparseVector-class.html#__init__,Method pyspark.mllib.recommendation.MatrixFactorizationModel.__init__()=pyspark.mllib.recommendation.MatrixFactorizationModel-class.html#__init__,Method pyspark.mllib.regression.LabeledPoint.__init__()=pyspark.mllib.regression.LabeledPoint-class.html#__init__,Method pyspark.mllib.regression.LinearModel.__init__()=pyspark.mllib.regression.LinearModel-class.html#__init__,Method pyspark.rdd.RDD.__init__()=pyspark.rdd.RDD-class.html#__init__,Method pyspark.resultiterable.ResultIterable.__init__()=pyspark.resultiterable.ResultIterable-class.html#__init__,Method pyspark.sql.Row.__init__()=pyspark.sql.Row-class.html#__init__,Method pyspark.sql.SQLContext.__init__()=pyspark.sql.SQLContext-class.html#__init__,Method pyspark.sql.SchemaRDD.__init__()=pyspark.sql.SchemaRDD-class.html#__init__,Method 
pyspark.statcounter.StatCounter.__init__()=pyspark.statcounter.StatCounter-class.html#__init__,Method pyspark.storagelevel.StorageLevel.__init__()=pyspark.storagelevel.StorageLevel-class.html#__init__"><a title="pyspark.accumulators.Accumulator.__init__
pyspark.accumulators.AddingAccumulatorParam.__init__
pyspark.broadcast.Broadcast.__init__
pyspark.conf.SparkConf.__init__
pyspark.context.SparkContext.__init__
pyspark.files.SparkFiles.__init__
pyspark.mllib.classification.NaiveBayesModel.__init__
pyspark.mllib.clustering.KMeansModel.__init__
pyspark.mllib.linalg.SparseVector.__init__
pyspark.mllib.recommendation.MatrixFactorizationModel.__init__
pyspark.mllib.regression.LabeledPoint.__init__
pyspark.mllib.regression.LinearModel.__init__
pyspark.rdd.RDD.__init__
pyspark.resultiterable.ResultIterable.__init__
pyspark.sql.Row.__init__
pyspark.sql.SQLContext.__init__
pyspark.sql.SchemaRDD.__init__
pyspark.statcounter.StatCounter.__init__
pyspark.storagelevel.StorageLevel.__init__" class="py-name" href="#" onclick="return doclink('link-55', '__init__', 'link-55');">__init__</a></tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-name">d</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L305"></a><tt class="py-lineno">305</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD"></a><div id="SchemaRDD-def"><a name="L306"></a><tt class="py-lineno">306</tt>  <tt class="py-line"> </tt>
<a name="L307"></a><tt class="py-lineno">307</tt> <a class="py-toggle" href="#" id="SchemaRDD-toggle" onclick="return toggle('SchemaRDD');">-</a><tt class="py-line"><tt class="py-keyword">class</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html">SchemaRDD</a><tt class="py-op">(</tt><tt class="py-base-class">RDD</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="SchemaRDD-expanded"><a name="L308"></a><tt class="py-lineno">308</tt>  <tt class="py-line">    <tt class="py-docstring">"""An RDD of L{Row} objects that has an associated schema.</tt> </tt>
<a name="L309"></a><tt class="py-lineno">309</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L310"></a><tt class="py-lineno">310</tt>  <tt class="py-line"><tt class="py-docstring">    The underlying JVM object is a SchemaRDD, not a PythonRDD, so we can</tt> </tt>
<a name="L311"></a><tt class="py-lineno">311</tt>  <tt class="py-line"><tt class="py-docstring">    utilize the relational query api exposed by SparkSQL.</tt> </tt>
<a name="L312"></a><tt class="py-lineno">312</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L313"></a><tt class="py-lineno">313</tt>  <tt class="py-line"><tt class="py-docstring">    For normal L{pyspark.rdd.RDD} operations (map, count, etc.) the</tt> </tt>
<a name="L314"></a><tt class="py-lineno">314</tt>  <tt class="py-line"><tt class="py-docstring">    L{SchemaRDD} is not operated on directly, as it's underlying</tt> </tt>
<a name="L315"></a><tt class="py-lineno">315</tt>  <tt class="py-line"><tt class="py-docstring">    implementation is an RDD composed of Java objects. Instead it is</tt> </tt>
<a name="L316"></a><tt class="py-lineno">316</tt>  <tt class="py-line"><tt class="py-docstring">    converted to a PythonRDD in the JVM, on which Python operations can</tt> </tt>
<a name="L317"></a><tt class="py-lineno">317</tt>  <tt class="py-line"><tt class="py-docstring">    be done.</tt> </tt>
<a name="L318"></a><tt class="py-lineno">318</tt>  <tt class="py-line"><tt class="py-docstring">    """</tt> </tt>
<a name="L319"></a><tt class="py-lineno">319</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.__init__"></a><div id="SchemaRDD.__init__-def"><a name="L320"></a><tt class="py-lineno">320</tt> <a class="py-toggle" href="#" id="SchemaRDD.__init__-toggle" onclick="return toggle('SchemaRDD.__init__');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#__init__">__init__</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">jschema_rdd</tt><tt class="py-op">,</tt> <tt class="py-param">sql_ctx</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.__init__-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.__init__-expanded"><a name="L321"></a><tt class="py-lineno">321</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">sql_ctx</tt> <tt class="py-op">=</tt> <tt class="py-name">sql_ctx</tt> </tt>
<a name="L322"></a><tt class="py-lineno">322</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-56" class="py-name"><a title="pyspark.files.SparkFiles._sc" class="py-name" href="#" onclick="return doclink('link-56', '_sc', 'link-6');">_sc</a></tt> <tt class="py-op">=</tt> <tt class="py-name">sql_ctx</tt><tt class="py-op">.</tt><tt id="link-57" class="py-name"><a title="pyspark.files.SparkFiles._sc" class="py-name" href="#" onclick="return doclink('link-57', '_sc', 'link-6');">_sc</a></tt> </tt>
<a name="L323"></a><tt class="py-lineno">323</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt> <tt class="py-op">=</tt> <tt class="py-name">jschema_rdd</tt> </tt>
<a name="L324"></a><tt class="py-lineno">324</tt>  <tt class="py-line"> </tt>
<a name="L325"></a><tt class="py-lineno">325</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
<a name="L326"></a><tt class="py-lineno">326</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
<a name="L327"></a><tt class="py-lineno">327</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">sql_ctx</tt><tt class="py-op">.</tt><tt id="link-58" class="py-name"><a title="pyspark.files.SparkFiles._sc" class="py-name" href="#" onclick="return doclink('link-58', '_sc', 'link-6');">_sc</a></tt> </tt>
<a name="L328"></a><tt class="py-lineno">328</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd_deserializer</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">serializer</tt> </tt>
</div><a name="L329"></a><tt class="py-lineno">329</tt>  <tt class="py-line"> </tt>
<a name="L330"></a><tt class="py-lineno">330</tt>  <tt class="py-line">    <tt class="py-decorator">@</tt><tt class="py-decorator">property</tt> </tt>
<a name="SchemaRDD._jrdd"></a><div id="SchemaRDD._jrdd-def"><a name="L331"></a><tt class="py-lineno">331</tt> <a class="py-toggle" href="#" id="SchemaRDD._jrdd-toggle" onclick="return toggle('SchemaRDD._jrdd');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#_jrdd">_jrdd</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD._jrdd-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD._jrdd-expanded"><a name="L332"></a><tt class="py-lineno">332</tt>  <tt class="py-line">        <tt class="py-docstring">"""Lazy evaluation of PythonRDD object.</tt> </tt>
<a name="L333"></a><tt class="py-lineno">333</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L334"></a><tt class="py-lineno">334</tt>  <tt class="py-line"><tt class="py-docstring">        Only done when a user calls methods defined by the</tt> </tt>
<a name="L335"></a><tt class="py-lineno">335</tt>  <tt class="py-line"><tt class="py-docstring">        L{pyspark.rdd.RDD} super class (map, filter, etc.).</tt> </tt>
<a name="L336"></a><tt class="py-lineno">336</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L337"></a><tt class="py-lineno">337</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-keyword">not</tt> <tt class="py-name">hasattr</tt><tt class="py-op">(</tt><tt class="py-name">self</tt><tt class="py-op">,</tt> <tt class="py-string">'_lazy_jrdd'</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L338"></a><tt class="py-lineno">338</tt>  <tt class="py-line">            <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_lazy_jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_toPython</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt> </tt>
<a name="L339"></a><tt class="py-lineno">339</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_lazy_jrdd</tt> </tt>
</div><a name="L340"></a><tt class="py-lineno">340</tt>  <tt class="py-line"> </tt>
<a name="L341"></a><tt class="py-lineno">341</tt>  <tt class="py-line">    <tt class="py-decorator">@</tt><tt class="py-decorator">property</tt> </tt>
<a name="SchemaRDD._id"></a><div id="SchemaRDD._id-def"><a name="L342"></a><tt class="py-lineno">342</tt> <a class="py-toggle" href="#" id="SchemaRDD._id-toggle" onclick="return toggle('SchemaRDD._id');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#_id">_id</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD._id-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD._id-expanded"><a name="L343"></a><tt class="py-lineno">343</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jrdd</tt><tt class="py-op">.</tt><tt id="link-59" class="py-name" targets="Method pyspark.rdd.RDD.id()=pyspark.rdd.RDD-class.html#id"><a title="pyspark.rdd.RDD.id" class="py-name" href="#" onclick="return doclink('link-59', 'id', 'link-59');">id</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L344"></a><tt class="py-lineno">344</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.saveAsParquetFile"></a><div id="SchemaRDD.saveAsParquetFile-def"><a name="L345"></a><tt class="py-lineno">345</tt> <a class="py-toggle" href="#" id="SchemaRDD.saveAsParquetFile-toggle" onclick="return toggle('SchemaRDD.saveAsParquetFile');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#saveAsParquetFile">saveAsParquetFile</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">path</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.saveAsParquetFile-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.saveAsParquetFile-expanded"><a name="L346"></a><tt class="py-lineno">346</tt>  <tt class="py-line">        <tt class="py-docstring">"""Save the contents as a Parquet file, preserving the schema.</tt> </tt>
<a name="L347"></a><tt class="py-lineno">347</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L348"></a><tt class="py-lineno">348</tt>  <tt class="py-line"><tt class="py-docstring">        Files that are written out using this method can be read back in as</tt> </tt>
<a name="L349"></a><tt class="py-lineno">349</tt>  <tt class="py-line"><tt class="py-docstring">        a SchemaRDD using the L{SQLContext.parquetFile} method.</tt> </tt>
<a name="L350"></a><tt class="py-lineno">350</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L351"></a><tt class="py-lineno">351</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; import tempfile, shutil</tt> </tt>
<a name="L352"></a><tt class="py-lineno">352</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; parquetFile = tempfile.mkdtemp()</tt> </tt>
<a name="L353"></a><tt class="py-lineno">353</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; shutil.rmtree(parquetFile)</tt> </tt>
<a name="L354"></a><tt class="py-lineno">354</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L355"></a><tt class="py-lineno">355</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.saveAsParquetFile(parquetFile)</tt> </tt>
<a name="L356"></a><tt class="py-lineno">356</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2 = sqlCtx.parquetFile(parquetFile)</tt> </tt>
<a name="L357"></a><tt class="py-lineno">357</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(srdd2.collect()) == sorted(srdd.collect())</tt> </tt>
<a name="L358"></a><tt class="py-lineno">358</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L359"></a><tt class="py-lineno">359</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L360"></a><tt class="py-lineno">360</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-60" class="py-name" targets="Method pyspark.sql.SchemaRDD.saveAsParquetFile()=pyspark.sql.SchemaRDD-class.html#saveAsParquetFile"><a title="pyspark.sql.SchemaRDD.saveAsParquetFile" class="py-name" href="#" onclick="return doclink('link-60', 'saveAsParquetFile', 'link-60');">saveAsParquetFile</a></tt><tt class="py-op">(</tt><tt class="py-name">path</tt><tt class="py-op">)</tt> </tt>
</div><a name="L361"></a><tt class="py-lineno">361</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.registerAsTable"></a><div id="SchemaRDD.registerAsTable-def"><a name="L362"></a><tt class="py-lineno">362</tt> <a class="py-toggle" href="#" id="SchemaRDD.registerAsTable-toggle" onclick="return toggle('SchemaRDD.registerAsTable');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#registerAsTable">registerAsTable</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">name</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.registerAsTable-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.registerAsTable-expanded"><a name="L363"></a><tt class="py-lineno">363</tt>  <tt class="py-line">        <tt class="py-docstring">"""Registers this RDD as a temporary table using the given name.</tt> </tt>
<a name="L364"></a><tt class="py-lineno">364</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L365"></a><tt class="py-lineno">365</tt>  <tt class="py-line"><tt class="py-docstring">        The lifetime of this temporary table is tied to the L{SQLContext}</tt> </tt>
<a name="L366"></a><tt class="py-lineno">366</tt>  <tt class="py-line"><tt class="py-docstring">        that was used to create this SchemaRDD.</tt> </tt>
<a name="L367"></a><tt class="py-lineno">367</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L368"></a><tt class="py-lineno">368</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L369"></a><tt class="py-lineno">369</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.registerAsTable("test")</tt> </tt>
<a name="L370"></a><tt class="py-lineno">370</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd2 = sqlCtx.sql("select * from test")</tt> </tt>
<a name="L371"></a><tt class="py-lineno">371</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; sorted(srdd.collect()) == sorted(srdd2.collect())</tt> </tt>
<a name="L372"></a><tt class="py-lineno">372</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L373"></a><tt class="py-lineno">373</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L374"></a><tt class="py-lineno">374</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-61" class="py-name" targets="Method pyspark.sql.SchemaRDD.registerAsTable()=pyspark.sql.SchemaRDD-class.html#registerAsTable"><a title="pyspark.sql.SchemaRDD.registerAsTable" class="py-name" href="#" onclick="return doclink('link-61', 'registerAsTable', 'link-61');">registerAsTable</a></tt><tt class="py-op">(</tt><tt id="link-62" class="py-name" targets="Method pyspark.rdd.RDD.name()=pyspark.rdd.RDD-class.html#name"><a title="pyspark.rdd.RDD.name" class="py-name" href="#" onclick="return doclink('link-62', 'name', 'link-62');">name</a></tt><tt class="py-op">)</tt> </tt>
</div><a name="L375"></a><tt class="py-lineno">375</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.insertInto"></a><div id="SchemaRDD.insertInto-def"><a name="L376"></a><tt class="py-lineno">376</tt> <a class="py-toggle" href="#" id="SchemaRDD.insertInto-toggle" onclick="return toggle('SchemaRDD.insertInto');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#insertInto">insertInto</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">tableName</tt><tt class="py-op">,</tt> <tt class="py-param">overwrite</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.insertInto-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.insertInto-expanded"><a name="L377"></a><tt class="py-lineno">377</tt>  <tt class="py-line">        <tt class="py-docstring">"""Inserts the contents of this SchemaRDD into the specified table.</tt> </tt>
<a name="L378"></a><tt class="py-lineno">378</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L379"></a><tt class="py-lineno">379</tt>  <tt class="py-line"><tt class="py-docstring">        Optionally overwriting any existing data.</tt> </tt>
<a name="L380"></a><tt class="py-lineno">380</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L381"></a><tt class="py-lineno">381</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-63" class="py-name" targets="Method pyspark.sql.SchemaRDD.insertInto()=pyspark.sql.SchemaRDD-class.html#insertInto"><a title="pyspark.sql.SchemaRDD.insertInto" class="py-name" href="#" onclick="return doclink('link-63', 'insertInto', 'link-63');">insertInto</a></tt><tt class="py-op">(</tt><tt class="py-name">tableName</tt><tt class="py-op">,</tt> <tt class="py-name">overwrite</tt><tt class="py-op">)</tt> </tt>
</div><a name="L382"></a><tt class="py-lineno">382</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.saveAsTable"></a><div id="SchemaRDD.saveAsTable-def"><a name="L383"></a><tt class="py-lineno">383</tt> <a class="py-toggle" href="#" id="SchemaRDD.saveAsTable-toggle" onclick="return toggle('SchemaRDD.saveAsTable');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#saveAsTable">saveAsTable</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">tableName</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.saveAsTable-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.saveAsTable-expanded"><a name="L384"></a><tt class="py-lineno">384</tt>  <tt class="py-line">        <tt class="py-docstring">"""Creates a new table with the contents of this SchemaRDD."""</tt> </tt>
<a name="L385"></a><tt class="py-lineno">385</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-64" class="py-name" targets="Method pyspark.sql.SchemaRDD.saveAsTable()=pyspark.sql.SchemaRDD-class.html#saveAsTable"><a title="pyspark.sql.SchemaRDD.saveAsTable" class="py-name" href="#" onclick="return doclink('link-64', 'saveAsTable', 'link-64');">saveAsTable</a></tt><tt class="py-op">(</tt><tt class="py-name">tableName</tt><tt class="py-op">)</tt> </tt>
</div><a name="L386"></a><tt class="py-lineno">386</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.schemaString"></a><div id="SchemaRDD.schemaString-def"><a name="L387"></a><tt class="py-lineno">387</tt> <a class="py-toggle" href="#" id="SchemaRDD.schemaString-toggle" onclick="return toggle('SchemaRDD.schemaString');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#schemaString">schemaString</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.schemaString-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.schemaString-expanded"><a name="L388"></a><tt class="py-lineno">388</tt>  <tt class="py-line">        <tt class="py-docstring">"""Returns the output schema in the tree format."""</tt> </tt>
<a name="L389"></a><tt class="py-lineno">389</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-65" class="py-name" targets="Method pyspark.sql.SchemaRDD.schemaString()=pyspark.sql.SchemaRDD-class.html#schemaString"><a title="pyspark.sql.SchemaRDD.schemaString" class="py-name" href="#" onclick="return doclink('link-65', 'schemaString', 'link-65');">schemaString</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L390"></a><tt class="py-lineno">390</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.printSchema"></a><div id="SchemaRDD.printSchema-def"><a name="L391"></a><tt class="py-lineno">391</tt> <a class="py-toggle" href="#" id="SchemaRDD.printSchema-toggle" onclick="return toggle('SchemaRDD.printSchema');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#printSchema">printSchema</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.printSchema-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.printSchema-expanded"><a name="L392"></a><tt class="py-lineno">392</tt>  <tt class="py-line">        <tt class="py-docstring">"""Prints out the schema in the tree format."""</tt> </tt>
<a name="L393"></a><tt class="py-lineno">393</tt>  <tt class="py-line">        <tt class="py-keyword">print</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-66" class="py-name"><a title="pyspark.sql.SchemaRDD.schemaString" class="py-name" href="#" onclick="return doclink('link-66', 'schemaString', 'link-65');">schemaString</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L394"></a><tt class="py-lineno">394</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.count"></a><div id="SchemaRDD.count-def"><a name="L395"></a><tt class="py-lineno">395</tt> <a class="py-toggle" href="#" id="SchemaRDD.count-toggle" onclick="return toggle('SchemaRDD.count');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#count">count</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.count-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.count-expanded"><a name="L396"></a><tt class="py-lineno">396</tt>  <tt class="py-line">        <tt class="py-docstring">"""Return the number of elements in this RDD.</tt> </tt>
<a name="L397"></a><tt class="py-lineno">397</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L398"></a><tt class="py-lineno">398</tt>  <tt class="py-line"><tt class="py-docstring">        Unlike the base RDD implementation of count, this implementation</tt> </tt>
<a name="L399"></a><tt class="py-lineno">399</tt>  <tt class="py-line"><tt class="py-docstring">        leverages the query optimizer to compute the count on the SchemaRDD,</tt> </tt>
<a name="L400"></a><tt class="py-lineno">400</tt>  <tt class="py-line"><tt class="py-docstring">        which supports features such as filter pushdown.</tt> </tt>
<a name="L401"></a><tt class="py-lineno">401</tt>  <tt class="py-line"><tt class="py-docstring"></tt> </tt>
<a name="L402"></a><tt class="py-lineno">402</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd = sqlCtx.inferSchema(rdd)</tt> </tt>
<a name="L403"></a><tt class="py-lineno">403</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.count()</tt> </tt>
<a name="L404"></a><tt class="py-lineno">404</tt>  <tt class="py-line"><tt class="py-docstring">        3L</tt> </tt>
<a name="L405"></a><tt class="py-lineno">405</tt>  <tt class="py-line"><tt class="py-docstring">        &gt;&gt;&gt; srdd.count() == srdd.map(lambda x: x).count()</tt> </tt>
<a name="L406"></a><tt class="py-lineno">406</tt>  <tt class="py-line"><tt class="py-docstring">        True</tt> </tt>
<a name="L407"></a><tt class="py-lineno">407</tt>  <tt class="py-line"><tt class="py-docstring">        """</tt> </tt>
<a name="L408"></a><tt class="py-lineno">408</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-67" class="py-name" targets="Method pyspark.rdd.RDD.count()=pyspark.rdd.RDD-class.html#count,Method pyspark.sql.SchemaRDD.count()=pyspark.sql.SchemaRDD-class.html#count,Method pyspark.statcounter.StatCounter.count()=pyspark.statcounter.StatCounter-class.html#count"><a title="pyspark.rdd.RDD.count
pyspark.sql.SchemaRDD.count
pyspark.statcounter.StatCounter.count" class="py-name" href="#" onclick="return doclink('link-67', 'count', 'link-67');">count</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L409"></a><tt class="py-lineno">409</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD._toPython"></a><div id="SchemaRDD._toPython-def"><a name="L410"></a><tt class="py-lineno">410</tt> <a class="py-toggle" href="#" id="SchemaRDD._toPython-toggle" onclick="return toggle('SchemaRDD._toPython');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#_toPython">_toPython</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD._toPython-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD._toPython-expanded"><a name="L411"></a><tt class="py-lineno">411</tt>  <tt class="py-line">        <tt class="py-comment"># We have to import the Row class explicitly, so that the reference Pickler has is</tt> </tt>
<a name="L412"></a><tt class="py-lineno">412</tt>  <tt class="py-line">        <tt class="py-comment"># pyspark.sql.Row instead of __main__.Row</tt> </tt>
<a name="L413"></a><tt class="py-lineno">413</tt>  <tt class="py-line">        <tt class="py-keyword">from</tt> <tt id="link-68" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-68', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-69" class="py-name"><a title="pyspark.sql
pyspark.sql.SQLContext.sql" class="py-name" href="#" onclick="return doclink('link-69', 'sql', 'link-41');">sql</a></tt> <tt class="py-keyword">import</tt> <tt id="link-70" class="py-name" targets="Class pyspark.sql.Row=pyspark.sql.Row-class.html"><a title="pyspark.sql.Row" class="py-name" href="#" onclick="return doclink('link-70', 'Row', 'link-70');">Row</a></tt> </tt>
<a name="L414"></a><tt class="py-lineno">414</tt>  <tt class="py-line">        <tt class="py-name">jrdd</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt class="py-name">javaToPython</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L415"></a><tt class="py-lineno">415</tt>  <tt class="py-line">        <tt class="py-comment"># TODO: This is inefficient, we should construct the Python Row object</tt> </tt>
<a name="L416"></a><tt class="py-lineno">416</tt>  <tt class="py-line">        <tt class="py-comment"># in Java land in the javaToPython function. May require a custom</tt> </tt>
<a name="L417"></a><tt class="py-lineno">417</tt>  <tt class="py-line">        <tt class="py-comment"># pickle serializer in Pyrolite</tt> </tt>
<a name="L418"></a><tt class="py-lineno">418</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-71" class="py-name"><a title="pyspark.rdd.RDD" class="py-name" href="#" onclick="return doclink('link-71', 'RDD', 'link-2');">RDD</a></tt><tt class="py-op">(</tt><tt class="py-name">jrdd</tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt id="link-72" class="py-name"><a title="pyspark.files.SparkFiles._sc" class="py-name" href="#" onclick="return doclink('link-72', '_sc', 'link-6');">_sc</a></tt><tt class="py-op">,</tt> <tt class="py-name">BatchedSerializer</tt><tt class="py-op">(</tt> </tt>
<a name="L419"></a><tt class="py-lineno">419</tt>  <tt class="py-line">                        <tt id="link-73" class="py-name"><a title="pyspark.serializers.PickleSerializer" class="py-name" href="#" onclick="return doclink('link-73', 'PickleSerializer', 'link-5');">PickleSerializer</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-74" class="py-name"><a title="pyspark.rdd.RDD.map" class="py-name" href="#" onclick="return doclink('link-74', 'map', 'link-35');">map</a></tt><tt class="py-op">(</tt><tt class="py-keyword">lambda</tt> <tt class="py-name">d</tt><tt class="py-op">:</tt> <tt id="link-75" class="py-name"><a title="pyspark.sql.Row" class="py-name" href="#" onclick="return doclink('link-75', 'Row', 'link-70');">Row</a></tt><tt class="py-op">(</tt><tt class="py-name">d</tt><tt class="py-op">)</tt><tt class="py-op">)</tt> </tt>
</div><a name="L420"></a><tt class="py-lineno">420</tt>  <tt class="py-line"> </tt>
<a name="L421"></a><tt class="py-lineno">421</tt>  <tt class="py-line">    <tt class="py-comment"># We override the default cache/persist/checkpoint behavior as we want to cache the underlying</tt> </tt>
<a name="L422"></a><tt class="py-lineno">422</tt>  <tt class="py-line">    <tt class="py-comment"># SchemaRDD object in the JVM, not the PythonRDD checkpointed by the super class</tt> </tt>
<a name="SchemaRDD.cache"></a><div id="SchemaRDD.cache-def"><a name="L423"></a><tt class="py-lineno">423</tt> <a class="py-toggle" href="#" id="SchemaRDD.cache-toggle" onclick="return toggle('SchemaRDD.cache');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#cache">cache</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.cache-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.cache-expanded"><a name="L424"></a><tt class="py-lineno">424</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L425"></a><tt class="py-lineno">425</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-76" class="py-name" targets="Method pyspark.rdd.RDD.cache()=pyspark.rdd.RDD-class.html#cache,Method pyspark.sql.SchemaRDD.cache()=pyspark.sql.SchemaRDD-class.html#cache"><a title="pyspark.rdd.RDD.cache
pyspark.sql.SchemaRDD.cache" class="py-name" href="#" onclick="return doclink('link-76', 'cache', 'link-76');">cache</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L426"></a><tt class="py-lineno">426</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt> </tt>
</div><a name="L427"></a><tt class="py-lineno">427</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.persist"></a><div id="SchemaRDD.persist-def"><a name="L428"></a><tt class="py-lineno">428</tt> <a class="py-toggle" href="#" id="SchemaRDD.persist-toggle" onclick="return toggle('SchemaRDD.persist');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#persist">persist</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">storageLevel</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.persist-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.persist-expanded"><a name="L429"></a><tt class="py-lineno">429</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L430"></a><tt class="py-lineno">430</tt>  <tt class="py-line">        <tt class="py-name">javaStorageLevel</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">ctx</tt><tt class="py-op">.</tt><tt class="py-name">_getJavaStorageLevel</tt><tt class="py-op">(</tt><tt class="py-name">storageLevel</tt><tt class="py-op">)</tt> </tt>
<a name="L431"></a><tt class="py-lineno">431</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-77" class="py-name" targets="Method pyspark.rdd.RDD.persist()=pyspark.rdd.RDD-class.html#persist,Method pyspark.sql.SchemaRDD.persist()=pyspark.sql.SchemaRDD-class.html#persist"><a title="pyspark.rdd.RDD.persist
pyspark.sql.SchemaRDD.persist" class="py-name" href="#" onclick="return doclink('link-77', 'persist', 'link-77');">persist</a></tt><tt class="py-op">(</tt><tt class="py-name">javaStorageLevel</tt><tt class="py-op">)</tt> </tt>
<a name="L432"></a><tt class="py-lineno">432</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt> </tt>
</div><a name="L433"></a><tt class="py-lineno">433</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.unpersist"></a><div id="SchemaRDD.unpersist-def"><a name="L434"></a><tt class="py-lineno">434</tt> <a class="py-toggle" href="#" id="SchemaRDD.unpersist-toggle" onclick="return toggle('SchemaRDD.unpersist');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#unpersist">unpersist</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.unpersist-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.unpersist-expanded"><a name="L435"></a><tt class="py-lineno">435</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_cached</tt> <tt class="py-op">=</tt> <tt class="py-name">False</tt> </tt>
<a name="L436"></a><tt class="py-lineno">436</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-78" class="py-name" targets="Method pyspark.rdd.RDD.unpersist()=pyspark.rdd.RDD-class.html#unpersist,Method pyspark.sql.SchemaRDD.unpersist()=pyspark.sql.SchemaRDD-class.html#unpersist"><a title="pyspark.rdd.RDD.unpersist
pyspark.sql.SchemaRDD.unpersist" class="py-name" href="#" onclick="return doclink('link-78', 'unpersist', 'link-78');">unpersist</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L437"></a><tt class="py-lineno">437</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt> </tt>
</div><a name="L438"></a><tt class="py-lineno">438</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.checkpoint"></a><div id="SchemaRDD.checkpoint-def"><a name="L439"></a><tt class="py-lineno">439</tt> <a class="py-toggle" href="#" id="SchemaRDD.checkpoint-toggle" onclick="return toggle('SchemaRDD.checkpoint');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#checkpoint">checkpoint</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.checkpoint-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.checkpoint-expanded"><a name="L440"></a><tt class="py-lineno">440</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">is_checkpointed</tt> <tt class="py-op">=</tt> <tt class="py-name">True</tt> </tt>
<a name="L441"></a><tt class="py-lineno">441</tt>  <tt class="py-line">        <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-79" class="py-name" targets="Method pyspark.rdd.RDD.checkpoint()=pyspark.rdd.RDD-class.html#checkpoint,Method pyspark.sql.SchemaRDD.checkpoint()=pyspark.sql.SchemaRDD-class.html#checkpoint"><a title="pyspark.rdd.RDD.checkpoint
pyspark.sql.SchemaRDD.checkpoint" class="py-name" href="#" onclick="return doclink('link-79', 'checkpoint', 'link-79');">checkpoint</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L442"></a><tt class="py-lineno">442</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.isCheckpointed"></a><div id="SchemaRDD.isCheckpointed-def"><a name="L443"></a><tt class="py-lineno">443</tt> <a class="py-toggle" href="#" id="SchemaRDD.isCheckpointed-toggle" onclick="return toggle('SchemaRDD.isCheckpointed');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#isCheckpointed">isCheckpointed</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.isCheckpointed-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.isCheckpointed-expanded"><a name="L444"></a><tt class="py-lineno">444</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-80" class="py-name" targets="Method pyspark.rdd.RDD.isCheckpointed()=pyspark.rdd.RDD-class.html#isCheckpointed,Method pyspark.sql.SchemaRDD.isCheckpointed()=pyspark.sql.SchemaRDD-class.html#isCheckpointed"><a title="pyspark.rdd.RDD.isCheckpointed
pyspark.sql.SchemaRDD.isCheckpointed" class="py-name" href="#" onclick="return doclink('link-80', 'isCheckpointed', 'link-80');">isCheckpointed</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
</div><a name="L445"></a><tt class="py-lineno">445</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.getCheckpointFile"></a><div id="SchemaRDD.getCheckpointFile-def"><a name="L446"></a><tt class="py-lineno">446</tt> <a class="py-toggle" href="#" id="SchemaRDD.getCheckpointFile-toggle" onclick="return toggle('SchemaRDD.getCheckpointFile');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#getCheckpointFile">getCheckpointFile</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.getCheckpointFile-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.getCheckpointFile-expanded"><a name="L447"></a><tt class="py-lineno">447</tt>  <tt class="py-line">        <tt class="py-name">checkpointFile</tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-81" class="py-name" targets="Method pyspark.rdd.RDD.getCheckpointFile()=pyspark.rdd.RDD-class.html#getCheckpointFile,Method pyspark.sql.SchemaRDD.getCheckpointFile()=pyspark.sql.SchemaRDD-class.html#getCheckpointFile"><a title="pyspark.rdd.RDD.getCheckpointFile
pyspark.sql.SchemaRDD.getCheckpointFile" class="py-name" href="#" onclick="return doclink('link-81', 'getCheckpointFile', 'link-81');">getCheckpointFile</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L448"></a><tt class="py-lineno">448</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-name">checkpointFile</tt><tt class="py-op">.</tt><tt class="py-name">isDefined</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L449"></a><tt class="py-lineno">449</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">checkpointFile</tt><tt class="py-op">.</tt><tt id="link-82" class="py-name" targets="Method pyspark.conf.SparkConf.get()=pyspark.conf.SparkConf-class.html#get,Class Method pyspark.files.SparkFiles.get()=pyspark.files.SparkFiles-class.html#get"><a title="pyspark.conf.SparkConf.get
pyspark.files.SparkFiles.get" class="py-name" href="#" onclick="return doclink('link-82', 'get', 'link-82');">get</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L450"></a><tt class="py-lineno">450</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L451"></a><tt class="py-lineno">451</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt class="py-name">None</tt> </tt>
</div><a name="L452"></a><tt class="py-lineno">452</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.coalesce"></a><div id="SchemaRDD.coalesce-def"><a name="L453"></a><tt class="py-lineno">453</tt> <a class="py-toggle" href="#" id="SchemaRDD.coalesce-toggle" onclick="return toggle('SchemaRDD.coalesce');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#coalesce">coalesce</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">,</tt> <tt class="py-param">shuffle</tt><tt class="py-op">=</tt><tt class="py-name">False</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.coalesce-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.coalesce-expanded"><a name="L454"></a><tt class="py-lineno">454</tt>  <tt class="py-line">        <tt id="link-83" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-83', 'rdd', 'link-1');">rdd</a></tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-84" class="py-name" targets="Method pyspark.rdd.RDD.coalesce()=pyspark.rdd.RDD-class.html#coalesce,Method pyspark.sql.SchemaRDD.coalesce()=pyspark.sql.SchemaRDD-class.html#coalesce"><a title="pyspark.rdd.RDD.coalesce
pyspark.sql.SchemaRDD.coalesce" class="py-name" href="#" onclick="return doclink('link-84', 'coalesce', 'link-84');">coalesce</a></tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">,</tt> <tt class="py-name">shuffle</tt><tt class="py-op">)</tt> </tt>
<a name="L455"></a><tt class="py-lineno">455</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-85" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-85', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt id="link-86" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-86', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">sql_ctx</tt><tt class="py-op">)</tt> </tt>
</div><a name="L456"></a><tt class="py-lineno">456</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.distinct"></a><div id="SchemaRDD.distinct-def"><a name="L457"></a><tt class="py-lineno">457</tt> <a class="py-toggle" href="#" id="SchemaRDD.distinct-toggle" onclick="return toggle('SchemaRDD.distinct');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#distinct">distinct</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.distinct-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.distinct-expanded"><a name="L458"></a><tt class="py-lineno">458</tt>  <tt class="py-line">        <tt id="link-87" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-87', 'rdd', 'link-1');">rdd</a></tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-88" class="py-name" targets="Method pyspark.rdd.RDD.distinct()=pyspark.rdd.RDD-class.html#distinct,Method pyspark.sql.SchemaRDD.distinct()=pyspark.sql.SchemaRDD-class.html#distinct"><a title="pyspark.rdd.RDD.distinct
pyspark.sql.SchemaRDD.distinct" class="py-name" href="#" onclick="return doclink('link-88', 'distinct', 'link-88');">distinct</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L459"></a><tt class="py-lineno">459</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-89" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-89', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt id="link-90" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-90', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">sql_ctx</tt><tt class="py-op">)</tt> </tt>
</div><a name="L460"></a><tt class="py-lineno">460</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.intersection"></a><div id="SchemaRDD.intersection-def"><a name="L461"></a><tt class="py-lineno">461</tt> <a class="py-toggle" href="#" id="SchemaRDD.intersection-toggle" onclick="return toggle('SchemaRDD.intersection');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#intersection">intersection</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.intersection-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.intersection-expanded"><a name="L462"></a><tt class="py-lineno">462</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">__class__</tt> <tt class="py-keyword">is</tt> <tt id="link-91" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-91', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L463"></a><tt class="py-lineno">463</tt>  <tt class="py-line">            <tt id="link-92" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-92', 'rdd', 'link-1');">rdd</a></tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-93" class="py-name" targets="Method pyspark.rdd.RDD.intersection()=pyspark.rdd.RDD-class.html#intersection,Method pyspark.sql.SchemaRDD.intersection()=pyspark.sql.SchemaRDD-class.html#intersection"><a title="pyspark.rdd.RDD.intersection
pyspark.sql.SchemaRDD.intersection" class="py-name" href="#" onclick="return doclink('link-93', 'intersection', 'link-93');">intersection</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">)</tt> </tt>
<a name="L464"></a><tt class="py-lineno">464</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt id="link-94" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-94', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt id="link-95" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-95', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">sql_ctx</tt><tt class="py-op">)</tt> </tt>
<a name="L465"></a><tt class="py-lineno">465</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L466"></a><tt class="py-lineno">466</tt>  <tt class="py-line">            <tt class="py-keyword">raise</tt> <tt class="py-name">ValueError</tt><tt class="py-op">(</tt><tt class="py-string">"Can only intersect with another SchemaRDD"</tt><tt class="py-op">)</tt> </tt>
</div><a name="L467"></a><tt class="py-lineno">467</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.repartition"></a><div id="SchemaRDD.repartition-def"><a name="L468"></a><tt class="py-lineno">468</tt> <a class="py-toggle" href="#" id="SchemaRDD.repartition-toggle" onclick="return toggle('SchemaRDD.repartition');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#repartition">repartition</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.repartition-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.repartition-expanded"><a name="L469"></a><tt class="py-lineno">469</tt>  <tt class="py-line">        <tt id="link-96" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-96', 'rdd', 'link-1');">rdd</a></tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-97" class="py-name" targets="Method pyspark.rdd.RDD.repartition()=pyspark.rdd.RDD-class.html#repartition,Method pyspark.sql.SchemaRDD.repartition()=pyspark.sql.SchemaRDD-class.html#repartition"><a title="pyspark.rdd.RDD.repartition
pyspark.sql.SchemaRDD.repartition" class="py-name" href="#" onclick="return doclink('link-97', 'repartition', 'link-97');">repartition</a></tt><tt class="py-op">(</tt><tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
<a name="L470"></a><tt class="py-lineno">470</tt>  <tt class="py-line">        <tt class="py-keyword">return</tt> <tt id="link-98" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-98', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt id="link-99" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-99', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">sql_ctx</tt><tt class="py-op">)</tt> </tt>
</div><a name="L471"></a><tt class="py-lineno">471</tt>  <tt class="py-line"> </tt>
<a name="SchemaRDD.subtract"></a><div id="SchemaRDD.subtract-def"><a name="L472"></a><tt class="py-lineno">472</tt> <a class="py-toggle" href="#" id="SchemaRDD.subtract-toggle" onclick="return toggle('SchemaRDD.subtract');">-</a><tt class="py-line">    <tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql.SchemaRDD-class.html#subtract">subtract</a><tt class="py-op">(</tt><tt class="py-param">self</tt><tt class="py-op">,</tt> <tt class="py-param">other</tt><tt class="py-op">,</tt> <tt class="py-param">numPartitions</tt><tt class="py-op">=</tt><tt class="py-name">None</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="SchemaRDD.subtract-collapsed" style="display:none;" pad="+++" indent="++++++++"></div><div id="SchemaRDD.subtract-expanded"><a name="L473"></a><tt class="py-lineno">473</tt>  <tt class="py-line">        <tt class="py-keyword">if</tt> <tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">__class__</tt> <tt class="py-keyword">is</tt> <tt id="link-100" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-100', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
<a name="L474"></a><tt class="py-lineno">474</tt>  <tt class="py-line">            <tt class="py-keyword">if</tt> <tt class="py-name">numPartitions</tt> <tt class="py-keyword">is</tt> <tt class="py-name">None</tt><tt class="py-op">:</tt> </tt>
<a name="L475"></a><tt class="py-lineno">475</tt>  <tt class="py-line">                <tt id="link-101" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-101', 'rdd', 'link-1');">rdd</a></tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-102" class="py-name" targets="Method pyspark.rdd.RDD.subtract()=pyspark.rdd.RDD-class.html#subtract,Method pyspark.sql.SchemaRDD.subtract()=pyspark.sql.SchemaRDD-class.html#subtract"><a title="pyspark.rdd.RDD.subtract
pyspark.sql.SchemaRDD.subtract" class="py-name" href="#" onclick="return doclink('link-102', 'subtract', 'link-102');">subtract</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">)</tt> </tt>
<a name="L476"></a><tt class="py-lineno">476</tt>  <tt class="py-line">            <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L477"></a><tt class="py-lineno">477</tt>  <tt class="py-line">                <tt id="link-103" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-103', 'rdd', 'link-1');">rdd</a></tt> <tt class="py-op">=</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">.</tt><tt id="link-104" class="py-name"><a title="pyspark.rdd.RDD.subtract
pyspark.sql.SchemaRDD.subtract" class="py-name" href="#" onclick="return doclink('link-104', 'subtract', 'link-102');">subtract</a></tt><tt class="py-op">(</tt><tt class="py-name">other</tt><tt class="py-op">.</tt><tt class="py-name">_jschema_rdd</tt><tt class="py-op">,</tt> <tt class="py-name">numPartitions</tt><tt class="py-op">)</tt> </tt>
<a name="L478"></a><tt class="py-lineno">478</tt>  <tt class="py-line">            <tt class="py-keyword">return</tt> <tt id="link-105" class="py-name"><a title="pyspark.sql.SchemaRDD" class="py-name" href="#" onclick="return doclink('link-105', 'SchemaRDD', 'link-15');">SchemaRDD</a></tt><tt class="py-op">(</tt><tt id="link-106" class="py-name"><a title="pyspark.rdd" class="py-name" href="#" onclick="return doclink('link-106', 'rdd', 'link-1');">rdd</a></tt><tt class="py-op">,</tt> <tt class="py-name">self</tt><tt class="py-op">.</tt><tt class="py-name">sql_ctx</tt><tt class="py-op">)</tt> </tt>
<a name="L479"></a><tt class="py-lineno">479</tt>  <tt class="py-line">        <tt class="py-keyword">else</tt><tt class="py-op">:</tt> </tt>
<a name="L480"></a><tt class="py-lineno">480</tt>  <tt class="py-line">            <tt class="py-keyword">raise</tt> <tt class="py-name">ValueError</tt><tt class="py-op">(</tt><tt class="py-string">"Can only subtract another SchemaRDD"</tt><tt class="py-op">)</tt> </tt>
</div></div><a name="L481"></a><tt class="py-lineno">481</tt>  <tt class="py-line"> </tt>
<a name="_test"></a><div id="_test-def"><a name="L482"></a><tt class="py-lineno">482</tt> <a class="py-toggle" href="#" id="_test-toggle" onclick="return toggle('_test');">-</a><tt class="py-line"><tt class="py-keyword">def</tt> <a class="py-def-name" href="pyspark.sql-module.html#_test">_test</a><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">:</tt> </tt>
</div><div id="_test-collapsed" style="display:none;" pad="+++" indent="++++"></div><div id="_test-expanded"><a name="L483"></a><tt class="py-lineno">483</tt>  <tt class="py-line">    <tt class="py-keyword">import</tt> <tt class="py-name">doctest</tt> </tt>
<a name="L484"></a><tt class="py-lineno">484</tt>  <tt class="py-line">    <tt class="py-keyword">from</tt> <tt class="py-name">array</tt> <tt class="py-keyword">import</tt> <tt class="py-name">array</tt> </tt>
<a name="L485"></a><tt class="py-lineno">485</tt>  <tt class="py-line">    <tt class="py-keyword">from</tt> <tt id="link-107" class="py-name"><a title="pyspark" class="py-name" href="#" onclick="return doclink('link-107', 'pyspark', 'link-0');">pyspark</a></tt><tt class="py-op">.</tt><tt id="link-108" class="py-name" targets="Module pyspark.context=pyspark.context-module.html,Method pyspark.rdd.RDD.context()=pyspark.rdd.RDD-class.html#context"><a title="pyspark.context
pyspark.rdd.RDD.context" class="py-name" href="#" onclick="return doclink('link-108', 'context', 'link-108');">context</a></tt> <tt class="py-keyword">import</tt> <tt id="link-109" class="py-name" targets="Class pyspark.context.SparkContext=pyspark.context.SparkContext-class.html"><a title="pyspark.context.SparkContext" class="py-name" href="#" onclick="return doclink('link-109', 'SparkContext', 'link-109');">SparkContext</a></tt> </tt>
<a name="L486"></a><tt class="py-lineno">486</tt>  <tt class="py-line">    <tt class="py-name">globs</tt> <tt class="py-op">=</tt> <tt class="py-name">globals</tt><tt class="py-op">(</tt><tt class="py-op">)</tt><tt class="py-op">.</tt><tt id="link-110" class="py-name" targets="Method pyspark.statcounter.StatCounter.copy()=pyspark.statcounter.StatCounter-class.html#copy"><a title="pyspark.statcounter.StatCounter.copy" class="py-name" href="#" onclick="return doclink('link-110', 'copy', 'link-110');">copy</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L487"></a><tt class="py-lineno">487</tt>  <tt class="py-line">    <tt class="py-comment"># The small batch size here ensures that we see multiple batches,</tt> </tt>
<a name="L488"></a><tt class="py-lineno">488</tt>  <tt class="py-line">    <tt class="py-comment"># even in these small test examples:</tt> </tt>
<a name="L489"></a><tt class="py-lineno">489</tt>  <tt class="py-line">    <tt class="py-name">sc</tt> <tt class="py-op">=</tt> <tt id="link-111" class="py-name"><a title="pyspark.context.SparkContext" class="py-name" href="#" onclick="return doclink('link-111', 'SparkContext', 'link-109');">SparkContext</a></tt><tt class="py-op">(</tt><tt class="py-string">'local[4]'</tt><tt class="py-op">,</tt> <tt class="py-string">'PythonTest'</tt><tt class="py-op">,</tt> <tt class="py-name">batchSize</tt><tt class="py-op">=</tt><tt class="py-number">2</tt><tt class="py-op">)</tt> </tt>
<a name="L490"></a><tt class="py-lineno">490</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sc'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">sc</tt> </tt>
<a name="L491"></a><tt class="py-lineno">491</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sqlCtx'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt id="link-112" class="py-name"><a title="pyspark.sql.SQLContext" class="py-name" href="#" onclick="return doclink('link-112', 'SQLContext', 'link-13');">SQLContext</a></tt><tt class="py-op">(</tt><tt class="py-name">sc</tt><tt class="py-op">)</tt> </tt>
<a name="L492"></a><tt class="py-lineno">492</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'rdd'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">sc</tt><tt class="py-op">.</tt><tt id="link-113" class="py-name" targets="Method pyspark.context.SparkContext.parallelize()=pyspark.context.SparkContext-class.html#parallelize"><a title="pyspark.context.SparkContext.parallelize" class="py-name" href="#" onclick="return doclink('link-113', 'parallelize', 'link-113');">parallelize</a></tt><tt class="py-op">(</tt><tt class="py-op">[</tt><tt class="py-op">{</tt><tt class="py-string">"field1"</tt> <tt class="py-op">:</tt> <tt class="py-number">1</tt><tt class="py-op">,</tt> <tt class="py-string">"field2"</tt> <tt class="py-op">:</tt> <tt class="py-string">"row1"</tt><tt class="py-op">}</tt><tt class="py-op">,</tt> </tt>
<a name="L493"></a><tt class="py-lineno">493</tt>  <tt class="py-line">        <tt class="py-op">{</tt><tt class="py-string">"field1"</tt> <tt class="py-op">:</tt> <tt class="py-number">2</tt><tt class="py-op">,</tt> <tt class="py-string">"field2"</tt><tt class="py-op">:</tt> <tt class="py-string">"row2"</tt><tt class="py-op">}</tt><tt class="py-op">,</tt> <tt class="py-op">{</tt><tt class="py-string">"field1"</tt> <tt class="py-op">:</tt> <tt class="py-number">3</tt><tt class="py-op">,</tt> <tt class="py-string">"field2"</tt><tt class="py-op">:</tt> <tt class="py-string">"row3"</tt><tt class="py-op">}</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
<a name="L494"></a><tt class="py-lineno">494</tt>  <tt class="py-line">    <tt class="py-name">jsonStrings</tt> <tt class="py-op">=</tt> <tt class="py-op">[</tt><tt class="py-string">'{"field1": 1, "field2": "row1", "field3":{"field4":11}}'</tt><tt class="py-op">,</tt> </tt>
<a name="L495"></a><tt class="py-lineno">495</tt>  <tt class="py-line">       <tt class="py-string">'{"field1" : 2, "field2": "row2", "field3":{"field4":22}}'</tt><tt class="py-op">,</tt> </tt>
<a name="L496"></a><tt class="py-lineno">496</tt>  <tt class="py-line">       <tt class="py-string">'{"field1" : 3, "field2": "row3", "field3":{"field4":33}}'</tt><tt class="py-op">]</tt> </tt>
<a name="L497"></a><tt class="py-lineno">497</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'jsonStrings'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">jsonStrings</tt> </tt>
<a name="L498"></a><tt class="py-lineno">498</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'json'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">sc</tt><tt class="py-op">.</tt><tt id="link-114" class="py-name"><a title="pyspark.context.SparkContext.parallelize" class="py-name" href="#" onclick="return doclink('link-114', 'parallelize', 'link-113');">parallelize</a></tt><tt class="py-op">(</tt><tt class="py-name">jsonStrings</tt><tt class="py-op">)</tt> </tt>
<a name="L499"></a><tt class="py-lineno">499</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'nestedRdd1'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">sc</tt><tt class="py-op">.</tt><tt id="link-115" class="py-name"><a title="pyspark.context.SparkContext.parallelize" class="py-name" href="#" onclick="return doclink('link-115', 'parallelize', 'link-113');">parallelize</a></tt><tt class="py-op">(</tt><tt class="py-op">[</tt> </tt>
<a name="L500"></a><tt class="py-lineno">500</tt>  <tt class="py-line">        <tt class="py-op">{</tt><tt class="py-string">"f1"</tt> <tt class="py-op">:</tt> <tt class="py-name">array</tt><tt class="py-op">(</tt><tt class="py-string">'i'</tt><tt class="py-op">,</tt> <tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">,</tt> <tt class="py-number">2</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-string">"f2"</tt> <tt class="py-op">:</tt> <tt class="py-op">{</tt><tt class="py-string">"row1"</tt> <tt class="py-op">:</tt> <tt class="py-number">1.0</tt><tt class="py-op">}</tt><tt class="py-op">}</tt><tt class="py-op">,</tt> </tt>
<a name="L501"></a><tt class="py-lineno">501</tt>  <tt class="py-line">        <tt class="py-op">{</tt><tt class="py-string">"f1"</tt> <tt class="py-op">:</tt> <tt class="py-name">array</tt><tt class="py-op">(</tt><tt class="py-string">'i'</tt><tt class="py-op">,</tt> <tt class="py-op">[</tt><tt class="py-number">2</tt><tt class="py-op">,</tt> <tt class="py-number">3</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-string">"f2"</tt> <tt class="py-op">:</tt> <tt class="py-op">{</tt><tt class="py-string">"row2"</tt> <tt class="py-op">:</tt> <tt class="py-number">2.0</tt><tt class="py-op">}</tt><tt class="py-op">}</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
<a name="L502"></a><tt class="py-lineno">502</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'nestedRdd2'</tt><tt class="py-op">]</tt> <tt class="py-op">=</tt> <tt class="py-name">sc</tt><tt class="py-op">.</tt><tt id="link-116" class="py-name"><a title="pyspark.context.SparkContext.parallelize" class="py-name" href="#" onclick="return doclink('link-116', 'parallelize', 'link-113');">parallelize</a></tt><tt class="py-op">(</tt><tt class="py-op">[</tt> </tt>
<a name="L503"></a><tt class="py-lineno">503</tt>  <tt class="py-line">        <tt class="py-op">{</tt><tt class="py-string">"f1"</tt> <tt class="py-op">:</tt> <tt class="py-op">[</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">,</tt> <tt class="py-number">2</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-op">[</tt><tt class="py-number">2</tt><tt class="py-op">,</tt> <tt class="py-number">3</tt><tt class="py-op">]</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-string">"f2"</tt> <tt class="py-op">:</tt> <tt id="link-117" class="py-name" targets="Method pyspark.conf.SparkConf.set()=pyspark.conf.SparkConf-class.html#set"><a title="pyspark.conf.SparkConf.set" class="py-name" href="#" onclick="return doclink('link-117', 'set', 'link-117');">set</a></tt><tt class="py-op">(</tt><tt class="py-op">[</tt><tt class="py-number">1</tt><tt class="py-op">,</tt> <tt class="py-number">2</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-string">"f3"</tt> <tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-number">1</tt><tt class="py-op">,</tt> <tt class="py-number">2</tt><tt class="py-op">)</tt><tt class="py-op">}</tt><tt class="py-op">,</tt> </tt>
<a name="L504"></a><tt class="py-lineno">504</tt>  <tt class="py-line">        <tt class="py-op">{</tt><tt class="py-string">"f1"</tt> <tt class="py-op">:</tt> <tt class="py-op">[</tt><tt class="py-op">[</tt><tt class="py-number">2</tt><tt class="py-op">,</tt> <tt class="py-number">3</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-op">[</tt><tt class="py-number">3</tt><tt class="py-op">,</tt> <tt class="py-number">4</tt><tt class="py-op">]</tt><tt class="py-op">]</tt><tt class="py-op">,</tt> <tt class="py-string">"f2"</tt> <tt class="py-op">:</tt> <tt id="link-118" class="py-name"><a title="pyspark.conf.SparkConf.set" class="py-name" href="#" onclick="return doclink('link-118', 'set', 'link-117');">set</a></tt><tt class="py-op">(</tt><tt class="py-op">[</tt><tt class="py-number">2</tt><tt class="py-op">,</tt> <tt class="py-number">3</tt><tt class="py-op">]</tt><tt class="py-op">)</tt><tt class="py-op">,</tt> <tt class="py-string">"f3"</tt> <tt class="py-op">:</tt> <tt class="py-op">(</tt><tt class="py-number">2</tt><tt class="py-op">,</tt> <tt class="py-number">3</tt><tt class="py-op">)</tt><tt class="py-op">}</tt><tt class="py-op">]</tt><tt class="py-op">)</tt> </tt>
<a name="L505"></a><tt class="py-lineno">505</tt>  <tt class="py-line">    <tt class="py-op">(</tt><tt class="py-name">failure_count</tt><tt class="py-op">,</tt> <tt class="py-name">test_count</tt><tt class="py-op">)</tt> <tt class="py-op">=</tt> <tt class="py-name">doctest</tt><tt class="py-op">.</tt><tt class="py-name">testmod</tt><tt class="py-op">(</tt><tt class="py-name">globs</tt><tt class="py-op">=</tt><tt class="py-name">globs</tt><tt class="py-op">,</tt><tt class="py-name">optionflags</tt><tt class="py-op">=</tt><tt class="py-name">doctest</tt><tt class="py-op">.</tt><tt class="py-name">ELLIPSIS</tt><tt class="py-op">)</tt> </tt>
<a name="L506"></a><tt class="py-lineno">506</tt>  <tt class="py-line">    <tt class="py-name">globs</tt><tt class="py-op">[</tt><tt class="py-string">'sc'</tt><tt class="py-op">]</tt><tt class="py-op">.</tt><tt id="link-119" class="py-name" targets="Method pyspark.context.SparkContext.stop()=pyspark.context.SparkContext-class.html#stop"><a title="pyspark.context.SparkContext.stop" class="py-name" href="#" onclick="return doclink('link-119', 'stop', 'link-119');">stop</a></tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L507"></a><tt class="py-lineno">507</tt>  <tt class="py-line">    <tt class="py-keyword">if</tt> <tt class="py-name">failure_count</tt><tt class="py-op">:</tt> </tt>
<a name="L508"></a><tt class="py-lineno">508</tt>  <tt class="py-line">        <tt class="py-name">exit</tt><tt class="py-op">(</tt><tt class="py-op">-</tt><tt class="py-number">1</tt><tt class="py-op">)</tt> </tt>
</div><a name="L509"></a><tt class="py-lineno">509</tt>  <tt class="py-line"> </tt>
<a name="L510"></a><tt class="py-lineno">510</tt>  <tt class="py-line"> </tt>
<a name="L511"></a><tt class="py-lineno">511</tt>  <tt class="py-line"><tt class="py-keyword">if</tt> <tt class="py-name">__name__</tt> <tt class="py-op">==</tt> <tt class="py-string">"__main__"</tt><tt class="py-op">:</tt> </tt>
<a name="L512"></a><tt class="py-lineno">512</tt>  <tt class="py-line">    <tt class="py-name">_test</tt><tt class="py-op">(</tt><tt class="py-op">)</tt> </tt>
<a name="L513"></a><tt class="py-lineno">513</tt>  <tt class="py-line"> </tt><script type="text/javascript">
<!--
expandto(location.href);
// -->
</script>
</pre>
<br />
<!-- ==================== NAVIGATION BAR ==================== -->
<!-- Top-level navigation bar: Home / Trees / Indices / Help links plus a
     right-aligned link back to the project homepage. Presentational
     attributes (border/bgcolor/cellspacing) are kept as emitted by Epydoc
     so the generated page's appearance is unchanged. -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
  <!-- Home link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="pyspark-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Tree link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Index link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Help link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Project homepage (https: the Spark site is served over TLS) -->
      <th class="navbar" align="right" width="100%">
        <table border="0" cellpadding="0" cellspacing="0">
          <tr><th class="navbar" align="center"
            ><a class="navbar" target="_top" href="https://spark.apache.org">Spark 1.0.1 Python API Docs</a></th>
          </tr></table></th>
  </tr>
</table>
<!-- Page footer: generator credit on the left, Epydoc homepage on the right.
     Fixed width="100%%" -> width="100%": the doubled percent sign (an
     escaping artifact from the doc generator's format string) is not a
     valid length value for the width attribute. -->
<table border="0" cellpadding="0" cellspacing="0" width="100%">
  <tr>
    <td align="left" class="footer">
    Generated by Epydoc 3.0.1 on Fri Jul  4 18:52:26 2014
    </td>
    <td align="right" class="footer">
      <a target="mainFrame" href="http://epydoc.sourceforge.net"
        >http://epydoc.sourceforge.net</a>
    </td>
  </tr>
</table>

<script type="text/javascript">
  // Private objects start out visible so the page still works without
  // JavaScript; when JS is available we hide them by default, unless a
  // cookie records that the user chose to show them. checkCookie()
  // applies that preference. (The legacy <!-- --> script-hiding wrapper
  // was dropped: it has been a no-op in all HTML parsers for decades.)
  checkCookie();
</script>
</body>
</html>