summaryrefslogtreecommitdiff
path: root/site/releases/spark-release-0-7-0.html
blob: 580e8fc7352007cc0aef54300376b96f82b00a83 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>
     Spark Release 0.7.0 | Apache Spark
    
  </title>

  

  

  <!-- Bootstrap core CSS -->
  <link href="/css/cerulean.min.css" rel="stylesheet">
  <link href="/css/custom.css" rel="stylesheet">

  <!-- Code highlighter CSS -->
  <link href="/css/pygments-default.css" rel="stylesheet">

  <script type="text/javascript">
  // Google Analytics initialization (legacy ga.js async snippet).
  // The original used HTML-style comment delimiters here, which only
  // parse as JS comments through legacy web-compat rules; plain //
  // comments are the robust form.
  var _gaq = _gaq || [];
  _gaq.push(['_setAccount', 'UA-32518208-2']);
  _gaq.push(['_trackPageview']);
  // Asynchronously inject the ga.js tracker, choosing the https://ssl
  // or http://www host to match the page's own protocol.
  (function() {
    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
  })();

  // Records a _trackEvent for an outbound link, then follows the link
  // after a 100ms delay so the async tracking beacon has time to fire.
  // Errors from _gaq are swallowed deliberately: tracking is best-effort
  // and must never block navigation.
  function trackOutboundLink(link, category, action) {
    try {
      _gaq.push(['_trackEvent', category , action]);
    } catch(err){}

    setTimeout(function() {
      document.location.href = link.href;
    }, 100);
  }
  </script>

  <!-- HTML5 shim and Respond.js IE8 support of HTML5 elements and media queries -->
  <!--[if lt IE 9]>
  <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
  <script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script>
  <![endif]-->
</head>

<body>

<script src="https://code.jquery.com/jquery.js"></script>
<script src="https://netdna.bootstrapcdn.com/bootstrap/3.0.3/js/bootstrap.min.js"></script>
<script src="/js/lang-tabs.js"></script>
<script src="/js/downloads.js"></script>

<div class="container" style="max-width: 1200px;">

<div class="masthead">
  
    <p class="lead">
      <a href="/">
      <img src="/images/spark-logo-trademark.png" alt="Apache Spark"
        style="height:100px; width:auto; vertical-align: bottom; margin-top: 20px;"></a><span class="tagline">
          Lightning-fast cluster computing
      </span>
    </p>
  
</div>

<nav class="navbar navbar-default" role="navigation">
  <!-- Brand and toggle get grouped for better mobile display -->
  <div class="navbar-header">
    <button type="button" class="navbar-toggle" data-toggle="collapse"
            data-target="#navbar-collapse-1">
      <span class="sr-only">Toggle navigation</span>
      <span class="icon-bar"></span>
      <span class="icon-bar"></span>
      <span class="icon-bar"></span>
    </button>
  </div>

  <!-- Collect the nav links, forms, and other content for toggling -->
  <div class="collapse navbar-collapse" id="navbar-collapse-1">
    <ul class="nav navbar-nav">
      <li><a href="/downloads.html">Download</a></li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Libraries <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/sql/">SQL and DataFrames</a></li>
          <li><a href="/streaming/">Spark Streaming</a></li>
          <li><a href="/mllib/">MLlib (machine learning)</a></li>
          <li><a href="/graphx/">GraphX (graph)</a></li>
          <li class="divider"></li>
          <li><a href="http://spark-packages.org">Third-Party Packages</a></li>
        </ul>
      </li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Documentation <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/docs/latest/">Latest Release (Spark 2.0.0)</a></li>
          <li><a href="/documentation.html">Older Versions and Other Resources</a></li>
        </ul>
      </li>
      <li><a href="/examples.html">Examples</a></li>
      <li class="dropdown">
        <a href="/community.html" class="dropdown-toggle" data-toggle="dropdown">
          Community <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/community.html">Mailing Lists</a></li>
          <li><a href="/community.html#events">Events and Meetups</a></li>
          <li><a href="/community.html#history">Project History</a></li>
          <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Powered+By+Spark">Powered By</a></li>
          <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Committers">Project Committers</a></li>
          <li><a href="https://issues.apache.org/jira/browse/SPARK">Issue Tracker</a></li>
        </ul>
      </li>
      <li><a href="/faq.html">FAQ</a></li>
    </ul>
    <ul class="nav navbar-nav navbar-right">
      <li class="dropdown">
        <a href="http://www.apache.org/" class="dropdown-toggle" data-toggle="dropdown">
          Apache Software Foundation <b class="caret"></b></a>
        <ul class="dropdown-menu">
          <li><a href="http://www.apache.org/">Apache Homepage</a></li>
          <li><a href="http://www.apache.org/licenses/">License</a></li>
          <li><a href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
          <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
          <li><a href="http://www.apache.org/security/">Security</a></li>
        </ul>
      </li>
    </ul>
  </div>
  <!-- /.navbar-collapse -->
</nav>


<div class="row">
  <div class="col-md-3 col-md-push-9">
    <div class="news" style="margin-bottom: 20px;">
      <h5>Latest News</h5>
      <ul class="list-unstyled">
        
          <li><a href="/news/spark-2-0-0-released.html">Spark 2.0.0 released</a>
          <span class="small">(Jul 26, 2016)</span></li>
        
          <li><a href="/news/spark-1-6-2-released.html">Spark 1.6.2 released</a>
          <span class="small">(Jun 25, 2016)</span></li>
        
          <li><a href="/news/submit-talks-to-spark-summit-eu-2016.html">Call for Presentations for Spark Summit EU is Open</a>
          <span class="small">(Jun 16, 2016)</span></li>
        
          <li><a href="/news/spark-2.0.0-preview.html">Preview release of Spark 2.0</a>
          <span class="small">(May 26, 2016)</span></li>
        
      </ul>
      <p class="small" style="text-align: right;"><a href="/news/index.html">Archive</a></p>
    </div>
    <div class="hidden-xs hidden-sm">
      <a href="/downloads.html" class="btn btn-success btn-lg btn-block" style="margin-bottom: 30px;">
        Download Spark
      </a>
      <p style="font-size: 16px; font-weight: 500; color: #555;">
        Built-in Libraries:
      </p>
      <ul class="list-none">
        <li><a href="/sql/">SQL and DataFrames</a></li>
        <li><a href="/streaming/">Spark Streaming</a></li>
        <li><a href="/mllib/">MLlib (machine learning)</a></li>
        <li><a href="/graphx/">GraphX (graph)</a></li>
      </ul>
      <a href="http://spark-packages.org">Third-Party Packages</a>
    </div>
  </div>

  <div class="col-md-9 col-md-pull-3">
    <h2>Spark Release 0.7.0</h2>


<p>The Spark team is proud to release version 0.7.0, a new major release that brings several new features. Most notable are a <a href="/docs/0.7.0/python-programming-guide.html">Python API for Spark</a> and an <a href="/docs/0.7.0/streaming-programming-guide.html">alpha of Spark Streaming</a>. (Details on Spark Streaming can also be found in this <a href="http://www.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-259.pdf">technical report</a>.) The release also adds numerous other improvements across the board. Overall, this is our biggest release to date, with 31 contributors, of which 20 were external to Berkeley.</p>

<p>You can download Spark 0.7.0 as either a <a href="http://spark-project.org/download/spark-0.7.0-sources.tgz">source package</a> (4 MB tar.gz) or <a href="http://spark-project.org/download-spark-0.7.0-prebuilt-tgz">prebuilt package</a> (60 MB tar.gz).</p>

<h3>Python API</h3>

<p>Spark 0.7 adds a <a href="/docs/0.7.0/python-programming-guide.html">Python API</a> called PySpark that makes it possible to use Spark from Python, both in standalone programs and in interactive Python shells. It uses the standard CPython runtime, so your programs can call into native libraries like NumPy and SciPy. Like the Scala and Java APIs, PySpark will automatically ship functions from your main program, along with the variables they depend on, to the cluster. PySpark supports most Spark features, including RDDs, accumulators, broadcast variables, and HDFS input and output.</p>

<h3>Spark Streaming Alpha</h3>

<p>Spark Streaming is a new extension of Spark that adds near-real-time processing capability. It offers a simple and high-level API, where users can transform streams using parallel operations like <tt>map</tt>, <tt>filter</tt>, <tt>reduce</tt>, and new sliding window functions. It automatically distributes work over a cluster and provides efficient fault recovery with exactly-once semantics for transformations, without relying on costly transactions to an external system. Spark Streaming is described in more detail in <a href="/talks/strata_spark_streaming.ppt">these slides</a> and <a href="http://www.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-259.pdf">our technical report</a>. This release is our first alpha of Spark Streaming, with most of the functionality implemented and APIs in Java and Scala.</p>

<h3>Memory Dashboard</h3>

<p>Spark jobs now launch a web dashboard for monitoring the memory usage of each distributed dataset (RDD) in the program. Look for lines like this in your log:</p>

<p><tt>15:08:44 INFO BlockManagerUI: Started BlockManager web UI at http://mbk.local:63814</tt></p>

<p>You can also control which port to use through the <tt>spark.ui.port</tt> property.</p>

<h3>Maven Build</h3>

<p>Spark can now be built using Maven in addition to SBT. The Maven build enables easier publishing to repositories of your choice, easy selection of Hadoop versions using the Maven profile (<tt>-Phadoop1</tt> or <tt>-Phadoop2</tt>), as well as Debian packaging using <tt>mvn -Phadoop1,deb install</tt>.</p>

<h3>New Operations</h3>

<p>This release adds several RDD transformations, including <tt>keys</tt>, <tt>values</tt>, <tt>keyBy</tt>, <tt>subtract</tt>, <tt>coalesce</tt>, and <tt>zip</tt>. It also adds <tt>SparkContext.hadoopConfiguration</tt> to allow programs to configure Hadoop input/output settings globally across operations. Finally, it adds the <tt>RDD.toDebugString()</tt> method, which can be used to print an RDD&#8217;s lineage graph for troubleshooting.</p>

<h3>EC2 Improvements</h3>

<ul>
  <li>Spark will now read S3 credentials from the <tt>AWS_ACCESS_KEY_ID</tt> and <tt>AWS_SECRET_ACCESS_KEY</tt> environment variables, if set, making it easier to access Amazon S3.</li>
  <li>This release fixes a bug with S3 access that would leave streams open when they are not fully read (e.g. when calling <tt>RDD.first()</tt> or a SQL query with a limit), causing nodes to hang.</li>
  <li>The EC2 scripts now support both standalone and Mesos clusters, and launch Ganglia on the cluster.</li>
  <li>Spark EC2 clusters can now be spread across multiple availability zones.</li>
</ul>

<h3>Other Improvements</h3>

<ul>
  <li>Shuffle operations like <tt>groupByKey</tt> and <tt>reduceByKey</tt> now try to infer parallelism from the size of the parent RDD (unless <tt>spark.default.parallelism</tt> is set).</li>
  <li>Several performance improvements to shuffles.</li>
  <li>Standalone deploy cluster now spreads jobs out across machines by default, leading to better data locality.</li>
  <li>Better error reporting when jobs aren't being launched due to insufficient resources.</li>
  <li>Standalone deploy web UI now includes JSON endpoints for querying cluster state.</li>
  <li>Better support for IBM JVM.</li>
  <li>Default Hadoop version dependency updated to 1.0.4.</li>
  <li>Improved failure handling and reporting of error messages.</li>
  <li>Separate configuration for standalone cluster daemons and user applications.</li>
  <li>Significant refactoring of the scheduler codebase to enable richer unit testing.</li>
  <li>Several bug and performance fixes throughout.</li>
</ul>

<h3>Compatibility</h3>

<p>This release is API-compatible with Spark 0.6 programs, but the following features changed slightly:</p>
<ul>
  <li>Parallel shuffle operations where you don't specify a level of parallelism use the number of partitions of the parent RDD instead of a constant default. However, if you set <tt>spark.default.parallelism</tt>, they will use that.</li>
  <li><tt>SparkContext.addFile</tt>, which distributes a file to worker nodes, is no longer guaranteed to put it in the executor's working directory&#8212;instead, you can find the directory it used using <tt>SparkFiles.getRootDirectory</tt>, or get a particular file using <tt>SparkFiles.get</tt>. This was done to avoid cluttering the local directory when running in local mode.</li>
</ul>

<h3>Credits</h3>

<p>Spark 0.7 was the work of many contributors from Berkeley and outside&#8212;in total, 31 different contributors, of which 20 were from outside Berkeley. Here are the people who contributed, along with areas they worked on:</p>

<ul>
  <li>Mikhail Bautin -- Maven build</li>
  <li>Denny Britz -- memory dashboard, streaming, bug fixes</li>
  <li>Paul Cavallaro -- error reporting</li>
  <li>Tathagata Das -- streaming (lead developer), 24/7 operation, bug fixes, docs</li>
  <li>Thomas Dudziak -- Maven build, Hadoop 2 support</li>
  <li>Harvey Feng -- bug fix</li>
  <li>Stephen Haberman -- new RDD operations, configuration, S3 improvements, code cleanup, bug fixes</li>
  <li>Tyson Hamilton -- JSON status endpoints</li>
  <li>Mark Hamstra -- API improvements, docs</li>
  <li>Michael Heuer -- docs</li>
  <li>Shane Huang -- shuffle performance fixes</li>
  <li>Andy Konwinski -- docs</li>
  <li>Ryan LeCompte -- streaming</li>
  <li>Haoyuan Li -- streaming</li>
  <li>Richard McKinley -- build</li>
  <li>Sean McNamara -- streaming</li>
  <li>Lee Moon Soo -- bug fix</li>
  <li>Fernand Pajot -- bug fix</li>
  <li>Nick Pentreath -- Python API, examples</li>
  <li>Andrew Psaltis -- bug fixes</li>
  <li>Imran Rashid -- memory dashboard, bug fixes</li>
  <li>Charles Reiss -- fault recovery fixes, code cleanup, testability, error reporting</li>
  <li>Josh Rosen -- Python API (lead developer), EC2 scripts, bug fixes</li>
  <li>Peter Sankauskas -- EC2 scripts</li>
  <li>Prashant Sharma -- streaming</li>
  <li>Shivaram Venkataraman -- EC2 scripts, optimizations</li>
  <li>Patrick Wendell -- streaming, bug fixes, examples, docs</li>
  <li>Reynold Xin -- optimizations, UI</li>
  <li>Haitao Yao -- run scripts</li>
  <li>Matei Zaharia -- streaming, fault recovery, Python API, code cleanup, bug fixes, docs</li>
  <li>Eric Zhang -- examples</li>
</ul>

<p>Thanks to everyone who contributed!</p>


<p>
<br>
<a href="/news/">Spark News Archive</a>
</p>

  </div>
</div>



<footer class="small">
  <hr>
  Apache Spark, Spark, Apache, and the Spark logo are <a href="/trademarks.html">trademarks</a> of
  <a href="http://www.apache.org">The Apache Software Foundation</a>.
</footer>

</div>

</body>
</html>