summaryrefslogtreecommitdiff
path: root/site/releases/spark-release-1-5-0.html
blob: 9dd120bda36413efc7c49806fb99c56875d8d42b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>
     Spark Release 1.5.0 | Apache Spark
    
  </title>

  

  

  <!-- Bootstrap core CSS -->
  <link href="/css/cerulean.min.css" rel="stylesheet">
  <link href="/css/custom.css" rel="stylesheet">

  <!-- Code highlighter CSS -->
  <link href="/css/pygments-default.css" rel="stylesheet">

  <script type="text/javascript">
  <!-- Google Analytics initialization -->
  // Legacy async ga.js tracker: _gaq is a command queue that buffers calls
  // until ga.js finishes loading, so tracking works even before the script arrives.
  var _gaq = _gaq || [];
  _gaq.push(['_setAccount', 'UA-32518208-2']); // Spark website GA property ID
  _gaq.push(['_trackPageview']);
  // Asynchronously inject the ga.js loader, choosing the ssl/www host to match
  // the current page protocol, and insert it before the first <script> on the page.
  (function() {
    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
  })();

  <!-- Adds slight delay to links to allow async reporting -->
  // Records a GA event for an outbound link, then navigates after a 100 ms
  // delay so the async event beacon has a chance to fire before page unload.
  // Tracking failures are swallowed so navigation always proceeds.
  function trackOutboundLink(link, category, action) {
    try {
      _gaq.push(['_trackEvent', category , action]);
    } catch(err){}

    setTimeout(function() {
      document.location.href = link.href;
    }, 100);
  }
  </script>

  <!-- HTML5 shim and Respond.js IE8 support of HTML5 elements and media queries -->
  <!--[if lt IE 9]>
  <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
  <script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script>
  <![endif]-->
</head>

<body>

<script src="https://code.jquery.com/jquery.js"></script>
<script src="https://netdna.bootstrapcdn.com/bootstrap/3.0.3/js/bootstrap.min.js"></script>
<script src="/js/lang-tabs.js"></script>
<script src="/js/downloads.js"></script>

<div class="container" style="max-width: 1200px;">

<div class="masthead">
  
    <p class="lead">
      <a href="/">
      <img src="/images/spark-logo-trademark.png"
        alt="Apache Spark"
        style="height:100px; width:auto; vertical-align: bottom; margin-top: 20px;"></a><span class="tagline">
          Lightning-fast cluster computing
      </span>
    </p>
  
</div>

<nav class="navbar navbar-default" role="navigation">
  <!-- Brand and toggle get grouped for better mobile display -->
  <div class="navbar-header">
    <button type="button" class="navbar-toggle" data-toggle="collapse"
            data-target="#navbar-collapse-1">
      <span class="sr-only">Toggle navigation</span>
      <span class="icon-bar"></span>
      <span class="icon-bar"></span>
      <span class="icon-bar"></span>
    </button>
  </div>

  <!-- Collect the nav links, forms, and other content for toggling -->
  <div class="collapse navbar-collapse" id="navbar-collapse-1">
    <ul class="nav navbar-nav">
      <li><a href="/downloads.html">Download</a></li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Libraries <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/sql/">SQL and DataFrames</a></li>
          <li><a href="/streaming/">Spark Streaming</a></li>
          <li><a href="/mllib/">MLlib (machine learning)</a></li>
          <li><a href="/graphx/">GraphX (graph)</a></li>
          <li class="divider"></li>
          <li><a href="http://spark-packages.org">Third-Party Packages</a></li>
        </ul>
      </li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Documentation <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/docs/latest/">Latest Release (Spark 2.0.0)</a></li>
          <li><a href="/documentation.html">Older Versions and Other Resources</a></li>
        </ul>
      </li>
      <li><a href="/examples.html">Examples</a></li>
      <li class="dropdown">
        <a href="/community.html" class="dropdown-toggle" data-toggle="dropdown">
          Community <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/community.html">Mailing Lists</a></li>
          <li><a href="/community.html#events">Events and Meetups</a></li>
          <li><a href="/community.html#history">Project History</a></li>
          <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Powered+By+Spark">Powered By</a></li>
          <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Committers">Project Committers</a></li>
          <li><a href="https://issues.apache.org/jira/browse/SPARK">Issue Tracker</a></li>
        </ul>
      </li>
      <li><a href="/faq.html">FAQ</a></li>
    </ul>
    <ul class="nav navbar-nav navbar-right">
      <li class="dropdown">
        <a href="http://www.apache.org/" class="dropdown-toggle" data-toggle="dropdown">
          Apache Software Foundation <b class="caret"></b></a>
        <ul class="dropdown-menu">
          <li><a href="http://www.apache.org/">Apache Homepage</a></li>
          <li><a href="http://www.apache.org/licenses/">License</a></li>
          <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
          <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
          <li><a href="http://www.apache.org/security/">Security</a></li>
        </ul>
      </li>
    </ul>
  </div>
  <!-- /.navbar-collapse -->
</nav>


<div class="row">
  <div class="col-md-3 col-md-push-9">
    <div class="news" style="margin-bottom: 20px;">
      <h5>Latest News</h5>
      <ul class="list-unstyled">
        
          <li><a href="/news/spark-2-0-0-released.html">Spark 2.0.0 released</a>
          <span class="small">(Jul 27, 2016)</span></li>
        
          <li><a href="/news/spark-1-6-2-released.html">Spark 1.6.2 released</a>
          <span class="small">(Jun 25, 2016)</span></li>
        
          <li><a href="/news/submit-talks-to-spark-summit-eu-2016.html">Call for Presentations for Spark Summit EU is Open</a>
          <span class="small">(Jun 16, 2016)</span></li>
        
          <li><a href="/news/spark-2.0.0-preview.html">Preview release of Spark 2.0</a>
          <span class="small">(May 26, 2016)</span></li>
        
      </ul>
      <p class="small" style="text-align: right;"><a href="/news/index.html">Archive</a></p>
    </div>
    <div class="hidden-xs hidden-sm">
      <a href="/downloads.html" class="btn btn-success btn-lg btn-block" style="margin-bottom: 30px;">
        Download Spark
      </a>
      <p style="font-size: 16px; font-weight: 500; color: #555;">
        Built-in Libraries:
      </p>
      <ul class="list-none">
        <li><a href="/sql/">SQL and DataFrames</a></li>
        <li><a href="/streaming/">Spark Streaming</a></li>
        <li><a href="/mllib/">MLlib (machine learning)</a></li>
        <li><a href="/graphx/">GraphX (graph)</a></li>
      </ul>
      <a href="http://spark-packages.org">Third-Party Packages</a>
    </div>
  </div>

  <div class="col-md-9 col-md-pull-3">
    <h2>Spark Release 1.5.0</h2>


<p>Spark 1.5.0 is the sixth release on the 1.x line. This release represents 1400+ patches from 230+ contributors and 80+ institutions. To download Spark 1.5.0 visit the <a href="/downloads.html">downloads</a> page.</p>

<p>You can consult JIRA for the <a href="https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=12315420&amp;version=12332078">detailed changes</a>. We have curated a list of high level changes here:</p>

<ul id="markdown-toc">
  <li><a href="#apis-rdd-dataframe-and-sql">APIs: RDD, DataFrame and SQL</a></li>
  <li><a href="#backend-execution-dataframe-and-sql">Backend Execution: DataFrame and SQL</a></li>
  <li><a href="#integrations-data-sources-hive-hadoop-mesos-and-cluster-management">Integrations: Data Sources, Hive, Hadoop, Mesos and Cluster Management</a></li>
  <li><a href="#r-language">R Language</a></li>
  <li><a href="#machine-learning-and-advanced-analytics">Machine Learning and Advanced Analytics</a></li>
  <li><a href="#spark-streaming">Spark Streaming</a></li>
  <li><a href="#deprecations-removals-configs-and-behavior-changes">Deprecations, Removals, Configs, and Behavior Changes</a>    <ul>
      <li><a href="#spark-core">Spark Core</a></li>
      <li><a href="#spark-sql--dataframes">Spark SQL &amp; DataFrames</a></li>
      <li><a href="#spark-streaming-1">Spark Streaming</a></li>
      <li><a href="#mllib">MLlib</a></li>
    </ul>
  </li>
  <li><a href="#known-issues">Known Issues</a>    <ul>
      <li><a href="#sqldataframe">SQL/DataFrame</a></li>
      <li><a href="#streaming">Streaming</a></li>
    </ul>
  </li>
  <li><a href="#credits">Credits</a></li>
</ul>

<h3 id="apis-rdd-dataframe-and-sql">APIs: RDD, DataFrame and SQL</h3>

<ul>
  <li>Consistent resolution of column names (see Behavior Changes section)</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-3947">SPARK-3947</a>: New experimental user-defined aggregate function (UDAF) interface</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8300">SPARK-8300</a>: DataFrame hint for broadcast joins</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8668">SPARK-8668</a>: expr function for turning a SQL expression into a DataFrame column</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-9076">SPARK-9076</a>:  Improved support for NaN values
    <ul>
      <li>NaN functions: isnan, nanvl</li>
      <li>dropna/fillna also fill/drop NaN values in addition to NULL values</li>
      <li>Equality test on NaN = NaN returns true</li>
      <li>NaN is greater than all other values</li>
      <li>In aggregation, NaN values go into one group</li>
    </ul>
  </li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8828">SPARK-8828</a>: Sum function returns null when all input values are nulls</li>
  <li>Data types
    <ul>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-8943">SPARK-8943</a>: CalendarIntervalType for time intervals</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-7937">SPARK-7937</a>: Support ordering on StructType</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-8866">SPARK-8866</a>: TimestampType’s precision is reduced to 1 microsecond (1us)</li>
    </ul>
  </li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8159">SPARK-8159</a>: Added ~100 functions, including <strong>date/time</strong>, <strong>string</strong>, <strong>math</strong>.</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8947">SPARK-8947</a>: Improved type coercion and error reporting in plan analysis phase (i.e. most errors should be reported in analysis time, rather than execution time)</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-1855">SPARK-1855</a>: Memory and local disk only checkpointing support</li>
</ul>

<h3 id="backend-execution-dataframe-and-sql">Backend Execution: DataFrame and SQL</h3>

<ul>
  <li><strong>Code generation on by default</strong> for almost all DataFrame/SQL functions</li>
  <li><strong>Improved aggregation</strong> execution in DataFrame/SQL
    <ul>
      <li>Cache friendly in-memory hash map layout</li>
      <li>Fallback to external-sort-based aggregation when memory is exhausted</li>
      <li>Code generation on by default for aggregations</li>
    </ul>
  </li>
  <li><strong>Improved join</strong> execution in DataFrame/SQL
    <ul>
      <li>Prefer (external) sort-merge join over hash join in shuffle joins (for left/right outer and inner joins), i.e. join data size is now bounded by disk rather than memory</li>
      <li>Support using (external) sort-merge join method for left/right outer joins</li>
      <li>Support for broadcast outer join</li>
    </ul>
  </li>
  <li><strong>Improved sort</strong> execution in DataFrame/SQL
    <ul>
      <li>Cache-friendly in-memory layout for sorting</li>
      <li>Fallback to external sorting when data exceeds memory size</li>
      <li>Code generated comparator for fast comparisons</li>
    </ul>
  </li>
  <li><strong>Native memory management &amp; representation</strong>
    <ul>
      <li>Compact binary in-memory data representation, leading to lower memory usage</li>
      <li>Execution memory is explicitly accounted for, without relying on JVM GC, leading to less GC and more robust memory management</li>
    </ul>
  </li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8638">SPARK-8638</a>: <strong>Improved performance &amp; memory usage in window functions</strong></li>
  <li><strong>Metrics instrumentation, reporting, and visualization</strong>
    <ul>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-8856">SPARK-8856</a>: Plan visualization for DataFrame/SQL</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-8735">SPARK-8735</a>: Expose metrics for runtime memory usage in web UI</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-4598">SPARK-4598</a>: Pagination for jobs with large number of tasks in web UI</li>
    </ul>
  </li>
</ul>

<h3 id="integrations-data-sources-hive-hadoop-mesos-and-cluster-management">Integrations: Data Sources, Hive, Hadoop, Mesos and Cluster Management</h3>

<ul>
  <li><strong>Mesos</strong>
    <ul>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-6284">SPARK-6284</a>: Support framework authentication and Mesos roles</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-6287">SPARK-6287</a>: Dynamic allocation in Mesos coarse-grained mode</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-6707">SPARK-6707</a>: User specified constraints on Mesos slave attributes</li>
    </ul>
  </li>
  <li><strong>YARN</strong>
    <ul>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-4352">SPARK-4352</a>: Dynamic allocation in YARN works with preferred locations</li>
    </ul>
  </li>
  <li><strong>Standalone Cluster Manager</strong>
    <ul>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-4751">SPARK-4751</a>: Dynamic resource allocation support</li>
    </ul>
  </li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-6906">SPARK-6906</a>: Improved <strong>Hive and metastore support</strong>
    <ul>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-8131">SPARK-8131</a>: Improved Hive database support</li>
      <li>Upgraded Hive dependency to Hive 1.2</li>
      <li>Support connecting to Hive 0.13, 0.14, 1.0/0.14.1, 1.1, 1.2 metastore</li>
      <li>Support partition pruning pushdown into the metastore (off by default; config flag spark.sql.hive.metastorePartitionPruning)</li>
      <li>Support persisting data in Hive compatible format in metastore</li>
    </ul>
  </li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-9381">SPARK-9381</a>: Support data partitioning for <strong>JSON</strong> data sources</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-5463">SPARK-5463</a>: <strong>Parquet</strong> improvements
    <ul>
      <li>Upgrade to Parquet 1.7</li>
      <li>Speedup metadata discovery and schema merging</li>
      <li>Predicate pushdown on by default</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-6774">SPARK-6774</a>: Support for reading non-standard legacy Parquet files generated by various libraries/systems by fully implementing all backwards-compatibility rules defined in parquet-format spec</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-4176">SPARK-4176</a>: Support for writing decimal values with precision greater than 18</li>
    </ul>
  </li>
  <li><strong>ORC</strong> improvements (various bug fixes)</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8890">SPARK-8890</a>: Faster and more robust <strong>dynamic partition insert</strong></li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-9486">SPARK-9486</a>: DataSourceRegister interface for external data sources to specify short names</li>
</ul>

<h3 id="r-language">R Language</h3>

<ul>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-6797">SPARK-6797</a>: Support for <strong>YARN cluster mode in R</strong></li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-6805">SPARK-6805</a>: <strong>GLMs with R formula</strong>, binomial/Gaussian families, and elastic-net regularization</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8742">SPARK-8742</a>: Improved error messages for R</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-9315">SPARK-9315</a>: Aliases to make DataFrame functions more R-like</li>
</ul>

<h3 id="machine-learning-and-advanced-analytics">Machine Learning and Advanced Analytics</h3>

<ul>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8521">SPARK-8521</a>: <strong>New Feature transformers</strong>:  CountVectorizer, Discrete Cosine transformation, MinMaxScaler, NGram, PCA, RFormula, StopWordsRemover, and VectorSlicer.</li>
  <li><strong>New Estimators in Pipeline API</strong>: <a href="https://issues.apache.org/jira/browse/SPARK-8600">SPARK-8600</a> naive Bayes, <a href="https://issues.apache.org/jira/browse/SPARK-7879">SPARK-7879</a> k-means, and <a href="https://issues.apache.org/jira/browse/SPARK-8671">SPARK-8671</a> isotonic regression.</li>
  <li><strong>New Algorithms</strong>: <a href="https://issues.apache.org/jira/browse/SPARK-9471">SPARK-9471</a> multilayer perceptron classifier, <a href="https://issues.apache.org/jira/browse/SPARK-6487">SPARK-6487</a> PrefixSpan for sequential pattern mining, <a href="https://issues.apache.org/jira/browse/SPARK-8559">SPARK-8559</a> association rule generation, <a href="https://issues.apache.org/jira/browse/SPARK-8598">SPARK-8598</a> 1-sample Kolmogorov-Smirnov test, etc.</li>
  <li><strong>Improvements to existing algorithms</strong>
    <ul>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-5572"><strong>LDA</strong></a>: online LDA performance, asymmetric doc concentration, perplexity, log-likelihood, top topics/documents, save/load, etc.</li>
      <li><strong>Trees and ensembles</strong>: class probabilities, feature importance for random forests, thresholds for classification, checkpointing for GBTs, etc.</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-9436"><strong>Pregel-API</strong></a>: more efficient Pregel API implementation for GraphX.</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-5016"><strong>GMM</strong></a>: distribute matrix inversions.</li>
    </ul>
  </li>
  <li><strong>Model summary</strong> for <a href="https://issues.apache.org/jira/browse/SPARK-8539">linear</a> and <a href="https://issues.apache.org/jira/browse/SPARK-9112">logistic regression</a>.</li>
  <li><strong>Python API</strong>: <a href="https://issues.apache.org/jira/browse/SPARK-5572">distributed matrices</a>, <a href="https://issues.apache.org/jira/browse/SPARK-5572">streaming k-means</a> and <a href="https://issues.apache.org/jira/browse/SPARK-5572">linear models</a>, <a href="https://issues.apache.org/jira/browse/SPARK-5572">LDA</a>, <a href="https://issues.apache.org/jira/browse/SPARK-5572">power iteration clustering</a>, etc.</li>
  <li><strong>Tuning and evaluation</strong>: <a href="https://issues.apache.org/jira/browse/SPARK-8484">train-validation split</a> and <a href="https://issues.apache.org/jira/browse/SPARK-7690">multiclass classification evaluator</a>.</li>
  <li><strong>Documentation</strong>: document the release version of public API methods</li>
</ul>

<h3 id="spark-streaming">Spark Streaming</h3>

<ul>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-7398">SPARK-7398</a>: <strong>Backpressure</strong>: Automatic and dynamic rate controlling in Spark Streaming for handling bursty input streams. This allows a streaming pipeline to dynamically adapt to changes in ingestion rates and computation loads. This works with receivers, as well as, the Direct Kafka approach.</li>
  <li><strong>Python API for streaming sources</strong>
    <ul>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-8389">SPARK-8389</a>: Kafka offsets of Direct Kafka streams available through Python API</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-8564">SPARK-8564</a>: Kinesis Python API</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-8378">SPARK-8378</a>: Flume Python API</li>
      <li><a href="https://issues.apache.org/jira/browse/SPARK-5155">SPARK-5155</a>: MQTT Python API</li>
    </ul>
  </li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-3258">SPARK-3258</a>: <strong>Python API for streaming machine learning algorithms</strong>: K-Means, linear regression, and logistic regression</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-9215">SPARK-9215</a>: <strong>Improved reliability of Kinesis streams</strong>: No need for enabling write ahead logs for saving and recovering received data across driver failures</li>
  <li><strong>Direct Kafka API graduated</strong>: Not experimental any more.</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8701">SPARK-8701</a>: <strong>Input metadata in UI</strong>: Kafka offsets, and input files are visible in the batch details UI</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8882">SPARK-8882</a>: Better load balancing and scheduling of receivers across cluster</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-4072">SPARK-4072</a>: Include streaming storage in web UI</li>
</ul>

<h3 id="deprecations-removals-configs-and-behavior-changes">Deprecations, Removals, Configs, and Behavior Changes</h3>

<h4 id="spark-core">Spark Core</h4>
<ul>
  <li>DAGScheduler’s local task execution mode has been removed</li>
  <li>Default driver and executor memory increased from 512m to 1g</li>
  <li>Default setting of JVM’s MaxPermSize increased from 128m to 256m</li>
  <li>Default logging level of spark-shell changed from INFO to WARN</li>
  <li>NIO-based ConnectionManager is deprecated, and will be removed in 1.6</li>
</ul>

<h4 id="spark-sql--dataframes">Spark SQL &amp; DataFrames</h4>
<ul>
  <li>Optimized execution using manually managed memory (Tungsten) is now enabled by default, along with code generation for expression evaluation. These features can both be disabled by setting spark.sql.tungsten.enabled to false.</li>
  <li>Parquet schema merging is no longer enabled by default. It can be re-enabled by setting spark.sql.parquet.mergeSchema to true.</li>
  <li>Resolution of strings to columns in Python now supports using dots (.) to qualify the column or access nested values. For example df[&#8216;table.column.nestedField&#8217;]. However, this means that if your column name contains any dots you must now escape them using backticks (e.g., <code>table.`column.with.dots`.nested</code>).</li>
  <li>In-memory columnar storage partition pruning is on by default. It can be disabled by setting spark.sql.inMemoryColumnarStorage.partitionPruning to false.</li>
  <li>Unlimited precision decimal columns are no longer supported, instead Spark SQL enforces a maximum precision of 38. When inferring schema from BigDecimal objects, a precision of (38, 18) is now used. When no precision is specified in DDL then the default remains Decimal(10, 0).</li>
  <li>Timestamps are now processed at a precision of 1us, rather than 100ns.</li>
  <li>Sum function returns null when all input values are nulls (null before 1.4, 0 in 1.4).</li>
  <li>In the sql dialect, floating point numbers are now parsed as decimal. HiveQL parsing remains unchanged.</li>
  <li>The canonical name of SQL/DataFrame functions are now lower case (e.g. sum vs SUM).</li>
  <li>It has been determined that using the DirectOutputCommitter when speculation is enabled is unsafe and thus this output committer will not be used by parquet when speculation is on, independent of configuration.</li>
  <li>JSON data source will not automatically load new files that are created by other applications (i.e. files that are not inserted to the dataset through Spark SQL). For a JSON persistent table (i.e. the metadata of the table is stored in Hive Metastore), users can use REFRESH TABLE SQL command or HiveContext&#8217;s refreshTable method to include those new files to the table. For a DataFrame representing a JSON dataset, users need to recreate the DataFrame and the new DataFrame will include new files.</li>
</ul>

<h4 id="spark-streaming-1">Spark Streaming</h4>
<ul>
  <li>New experimental backpressure feature can be enabled by setting the configuration spark.streaming.backpressure.enabled to true.</li>
  <li>Write Ahead Log does not need to be enabled for Kinesis streams. The updated Kinesis receiver keeps track of Kinesis sequence numbers received in each batch, and uses that information to re-read the necessary data while recovering from failures.</li>
  <li>The number of times the receivers are relaunched on failure is not limited by the max Spark task attempts. The system will always try to relaunch receivers after failures until the StreamingContext is stopped.</li>
  <li>Improved load balancing of receivers across the executors, even after relaunching.</li>
  <li>Enabling checkpointing when using queueStream throws an exception as queueStream cannot be checkpointed. However, we found this to break certain existing apps. So this change will be reverted in Spark 1.5.1.</li>
</ul>

<h4 id="mllib">MLlib</h4>

<p>In the spark.mllib package, there are no breaking API changes but some behavior changes:</p>

<ul>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-9005">SPARK-9005</a>: RegressionMetrics.explainedVariance returns the average regression sum of squares.</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-8600">SPARK-8600</a>: NaiveBayesModel.labels become sorted.</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-3382">SPARK-3382</a>: GradientDescent has a default convergence tolerance 1e-3, and hence iterations might end earlier than 1.4.</li>
</ul>

<p>In the experimental spark.ml package, there exists one breaking API change and one behavior change:</p>

<ul>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-9268">SPARK-9268</a>: Java&#8217;s varargs support is removed from Params.setDefault due to a Scala compiler bug.</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10097">SPARK-10097</a>: Evaluator.isLargerBetter is added to indicate metric ordering. Metrics like RMSE no longer flip signs as in 1.4.</li>
</ul>

<h3 id="known-issues">Known Issues</h3>

<p>The following issues are known in 1.5.0, and will be fixed in 1.5.1 release.</p>

<h4 id="sqldataframe">SQL/DataFrame</h4>

<ul>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10301">SPARK-10301</a>: Reading parquet files with different schema (schema merging) for nested structs can return the wrong answer</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10466">SPARK-10466</a>: AssertionError when spilling data during sort-based shuffle with data spill</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10441">SPARK-10441</a>: Timestamp data type cannot be written out as JSON</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10495">SPARK-10495</a>: Date values saved to JSON are stored as strings representing the number of days from epoch (1970-01-01 00:00:00 UTC) instead of strings in the format of &#8220;yyyy-mm-dd&#8221;.</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10403">SPARK-10403</a>: Tungsten mode does not work with tungsten-sort shuffle manager (which is off by default)</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10422">SPARK-10422</a>: In-memory cache of string type with dictionary encoding is broken</li>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10434">SPARK-10434</a> Parquet files with null elements in arrays written by Spark 1.5.0 cannot be read by earlier versions of Spark</li>
</ul>

<h4 id="streaming">Streaming</h4>
<ul>
  <li><a href="https://issues.apache.org/jira/browse/SPARK-10224">SPARK-10224</a> Small chance of data loss when StreamingContext is stopped gracefully</li>
</ul>

<h3 id="credits">Credits</h3>

<p>We would like to thank the following organizations for testing the release candidates with their workloads: Mesosphere, Typesafe, Tencent, Palantir, Cloudera, Hortonworks, Huawei, Shopify, Netflix, Intel, Yahoo, Kixer, UC Berkeley and Databricks.</p>

<p>Last but not least, this release would not have been possible without the following contributors: Aaron Davidson, Adam Roberts, Ai He, Akshat Aranya, Alex Shkurenko, Alex Slusarenko, Alexander Ulanov, Alok Singh, Amey Chaugule, Andrew Or, Andrew Ray, Animesh Baranawal, Ankur Chauhan, Ankur Dave, Ben Fradet, Bert Greevenbosch, Bimal Tandel, Brennan Ashton, Brennon York, Brian Lockwood, Bryan Cutler, Burak Yavuz, Calvin Jia, Carl Anders Duvel, Carson Wang, Chen Xu, Cheng Hao, Cheng Lian, Cheolsoo Park, Chris Freeman, Christian Kadner, Cody Koeninger, Damian Guy, Daniel Darabos, Daniel Emaasit, Daoyuan Wang, Dariusz Kobylarz, David Arroyo Cazorla, Davies Liu, DB Tsai, Dennis Huo, Deron Eriksson, Devaraj K, Dibyendu Bhattacharya, Dong Wang, Emiliano Leporati, Eric Liang, Favio Vazquez, Felix Cheung, Feynman Liang, Forest Fang, Francois Garillot, Gen Tang, George Dittmar, Guo Wei, GuoQiang Li, Han JU, Hao Zhu, Hari Shreedharan, Herman Van Hovell, Holden Karau, Hossein Falaki, Huang Zhaowei, Hyukjin Kwon, Ilya Ganelin, Imran Rashid, Iulian Dragos, Jacek Lewandowski, Jacky Li, Jan Prach, Jean Lyn, Jeff Zhang, Jiajin Zhang, Jie Huang, Jihong MA, Jonathan Alter, Jose Cambronero, Joseph Batchik, Joseph Gonzalez, Joseph K. 
Bradley, Josh Rosen, Judy Nash, Juhong Park, Kai Sasaki, Kai Zeng, KaiXinXiaoLei, Kan Zhang, Kashif Rasul, Kay Ousterhout, Keiji Yoshida, Kenichi Maehashi, Keuntae Park, Kevin Conor, Konstantin Shaposhnikov, Kousuke Saruta, Kun Xu, Lars Francke, Leah McGuire, lee19, Liang-Chi Hsieh, Lianhui Wang, Luca Martinetti, Luciano Resende, Manoj Kumar, Marcelo Vanzin, Mark Smith, Martin Zapletal, Matei Zaharia, Mateusz Buskiewicz, Matt Massie, Matthew Brandyberry, Meethu Mathew, Meihua Wu, Michael Allman, Michael Armbrust, Michael Davies, Michael Sannella, Michael Vogiatzis, Michel Lemay, Mike Dusenberry, Min Zhou, Mingfei Shi, mosessky, Moussa Taifi, Mridul Muralidharan, NamelessAnalyst, Namit Katariya, Nan Zhu, Nathan Howell, Navis Ryu, Neelesh Srinivas Salian, Nicholas Chammas, Nicholas Hwang, Nilanjan Raychaudhuri, Niranjan Padmanabhan, Nishkam Ravi, Nishkam Ravi, Noel Smith, Oleksiy Dyagilev, Oleksiy Dyagilev, Paavo Parkkinen, Patrick Baier, Patrick Wendell, Pawel Kozikowski, Pedro Rodriguez, Perinkulam I. Ganesh, Piotr Migdal, Prabeesh K, Pradeep Chhetri, Prayag Chandran, Punya Biswal, Qian Huang, Radek Ostrowski, Rahul Palamuttam, Ram Sriharsha, Rekha Joshi, Rekha Joshi, Rene Treffer, Reynold Xin, Roger Menezes, Rohit Agarwal, Rosstin Murphy, Rowan Chattaway, Ryan Williams, Saisai Shao, Sameer Abhyankar, Sandy Ryza, Santiago M. Mola, Scott Taylor, Sean Owen, Sephiroth Lin, Seth Hendrickson, Sheng Li, Shilei Qian, Shivaram Venkataraman, Shixiong Zhu, Shuo Bai, Shuo Xiang, Simon Hafner, Spiro Michaylov, Stan Zhai, Stefano Parmesan, Steve Lindemann, Steve Loughran, Steven She, Su Yan, Sudhakar Thota, Sun Rui, Takeshi YAMAMURO, Takuya Ueshin, Tao Li, Tarek Auel, Tathagata Das, Ted Blackman, Ted Yu, Thomas Omans, Thomas Szymanski, Tien-Dung Le, Tijo Thomas, Tim Ellison, Timothy Chen, Tom Graves, Tom White, Tomohiko K., Vincent D. 
Warmerdam, Vinod K C, Vinod KC, Vladimir Vladimirov, Vyacheslav Baranov, Wang Tao, Wang Wei, Weizhong Lin, Wenchen Fan, Wisely Chen, Xiangrui Meng, Xu Tingjun, Xusen Yin, Yadong Qi, Yanbo Liang, Yash Datta, Yijie Shen, Yin Huai, Yong Tang, Yu ISHIKAWA, Yuhao Yang, Yuming Wang, Yuri Saito, Yuu ISHIKAWA, Zc He, Zhang, Liye, Zhichao Li, Zhongshuai Pei, Zoltan Zvara, and a few unknown contributors (please indicate your email and name in your git commit to show up here).</p>


<p>
<br>
<a href="/news/">Spark News Archive</a>
</p>

  </div>
</div>



<footer class="small">
  <hr>
  Apache Spark, Spark, Apache, and the Spark logo are <a href="https://www.apache.org/foundation/marks/">trademarks</a> of
  <a href="http://www.apache.org">The Apache Software Foundation</a>.
</footer>

</div>

</body>
</html>