summaryrefslogtreecommitdiff
path: root/site/index.html
blob: b773efdca1f86d97aa9458bd5a963df83c502dd1 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
<!DOCTYPE html>
<!--[if IE 6]>
<html id="ie6" dir="ltr" lang="en-US">
<![endif]-->
<!--[if IE 7]>
<html id="ie7" dir="ltr" lang="en-US">
<![endif]-->
<!--[if IE 8]>
<html id="ie8" dir="ltr" lang="en-US">
<![endif]-->
<!--[if !(IE 6) | !(IE 7) | !(IE 8)  ]><!-->
<html dir="ltr" lang="en-US">
<!--<![endif]-->
<head>
  <link rel="shortcut icon" href="/favicon.ico" />
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width" />
  <title>
     Apache Spark - Lightning-Fast Cluster Computing
    
  </title>

  <link rel="stylesheet" type="text/css" media="all" href="/css/style.css" />
  <link rel="stylesheet" href="/css/pygments-default.css">

  <script>
    (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
        m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
    })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

    ga('create', 'UA-32518208-2', 'apache.org');
    ga('send', 'pageview');

  </script>

  <link rel='canonical' href='/index.html' />

  <style type="text/css">
    #site-title,
    #site-description {
      position: absolute !important;
      clip: rect(1px 1px 1px 1px); /* IE6, IE7 */
      clip: rect(1px, 1px, 1px, 1px);
    }
  </style>
  <style type="text/css" id="custom-background-css">
    body.custom-background { background-color: #f1f1f1; }
  </style>
</head>

<!--body class="page two-column right-sidebar"-->
<body class="page">
<div id="page" class="hfeed">

  <header id="branding" role="banner">
  <hgroup>
    <h1 id="site-title"><span><a href="/" title="Spark" rel="home">Spark</a></span></h1>
    <h2 id="site-description">Lightning-Fast Cluster Computing</h2>
  </hgroup>

  <a href="/">
    <img src="/images/spark-project-header1.png" width="1000" height="220" alt="Spark: Lightning-Fast Cluster Computing" title="Spark: Lightning-Fast Cluster Computing" />
  </a>

  <nav id="access" role="navigation">
    <h3 class="assistive-text">Main menu</h3>
    <div class="menu-main-menu-container">
      <ul id="menu-main-menu" class="menu">
        
        <li class="menu-item menu-item-type-post_type menu-item-object-page current-menu-item">
          <a href="/index.html">Home</a>
        </li>
        
        <li class="menu-item menu-item-type-post_type menu-item-object-page ">
          <a href="/downloads.html">Downloads</a>
        </li>
        
        <li class="menu-item menu-item-type-post_type menu-item-object-page ">
          <a href="/documentation.html">Documentation</a>
        </li>
        
        <li class="menu-item menu-item-type-post_type menu-item-object-page ">
          <a href="/examples.html">Examples</a>
        </li>
        
        <li class="menu-item menu-item-type-post_type menu-item-object-page ">
          <a href="/mailing-lists.html">Mailing Lists</a>
        </li>
        
        <li class="menu-item menu-item-type-post_type menu-item-object-page ">
          <a href="/research.html">Research</a>
        </li>
        
        <li class="menu-item menu-item-type-post_type menu-item-object-page ">
          <a href="/faq.html">FAQ</a>
        </li>
        
      </ul></div>
  </nav><!-- #access -->
</header><!-- #branding -->



  <div id="main">
    <div id="primary">
      <div id="content" role="main">
        
          <article class="page type-page status-publish hentry">
            <h2 id="what-is-apache-spark">What is Apache Spark?</h2>

<p>Apache Spark is an open source cluster computing system that aims to make data analytics <em>fast</em> — both fast to run and fast to write.</p>

<p>To run programs faster, Spark offers a general execution model that can optimize arbitrary operator graphs, and supports in-memory computing, which lets it query data faster than disk-based engines like Hadoop.</p>

<p>To make programming faster, Spark provides clean, concise APIs in
<a href="/docs/latest/quick-start.html#a-standalone-job-in-python">Python</a>,
<a href="http://www.scala-lang.org" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://www.scala-lang.org']);">Scala</a> and
<a href="/docs/latest/quick-start.html#a-standalone-job-in-java">Java</a>.
You can also use Spark interactively from the Scala and Python shells to rapidly query big datasets.</p>

<h2 id="what-can-it-do">What can it do?</h2>

<p>Spark was initially developed for two  applications where placing data in memory helps: <em>iterative</em> algorithms, which are common in machine learning, and <em>interactive</em> data mining. In both cases, Spark can run up to <b>100x</b> faster than Hadoop MapReduce. However, you can use Spark for general data processing too. Check out our <a href="/examples.html">example jobs</a>.</p>

<p>Spark is also the engine behind <a href="http://shark.cs.berkeley.edu" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://shark.cs.berkeley.edu']);">Shark</a>, a fully <a href="http://hive.apache.org" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://hive.apache.org']);">Apache Hive</a>-compatible data warehousing system that can run 100x faster than Hive.</p>

<p>While Spark is a new engine, it can access any data source supported by Hadoop, making it easy to run over existing data.</p>

<h2 id="who-uses-it">Who uses it?</h2>
<p>Spark was initially developed in the <a href="https://amplab.cs.berkeley.edu" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://amplab.cs.berkeley.edu']);">UC Berkeley AMPLab</a>, but is now being used and developed at a wide array of companies, including <a href="http://www.yahoo.com" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://www.yahoo.com']);">Yahoo!</a>, <a href="http://www.conviva.com" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://www.conviva.com']);">Conviva</a>, and <a href="http://www.quantifind.com" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://www.quantifind.com']);">Quantifind</a>.
In total, over 20 companies have contributed code to Spark.
Spark is <a href="https://github.com/mesos/spark" onclick="javascript:_gaq.push(['_trackEvent','outbound-article','http://github.com']);">open source</a> under an Apache license, so <a href="/downloads.html">download</a> it to check it out.</p>

<h2 id="apache-incubator-notice">Apache Incubator notice</h2>
<p>Apache Spark is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.</p>


          </article><!-- #post -->
        
      </div><!-- #content -->
      
      <div id="secondary" class="widget-area" role="complementary">
        
<h3 class="widget-title">Latest News</h3>
<div class="latestnewswidget">
    
      <div><a href="/news/spark-0-8-0-released.html">Spark 0.8.0 released</a> <span class="post-info">(September 25, 2013)</span></div>
    
      <div><a href="/news/spark-user-survey-and-powered-by-page.html">Spark user survey and "Powered By" page</a> <span class="post-info">(September 05, 2013)</span></div>
    
      <div><a href="/news/fourth-spark-screencast-published.html">Fourth Spark screencast released</a> <span class="post-info">(August 27, 2013)</span></div>
    
      <div><a href="/news/amp-camp-2013-registration-ope.html">Registration open for AMP Camp training camp in Berkeley</a> <span class="post-info">(July 23, 2013)</span></div>
    
  </div>

<div style="text-align:right"><a href="/news/index.html">News Archive</a></div>

<p><!-- Not porting the following to Pygments since it becomes a lot less colorful --></p>

<div class="code" style="margin-top: 20px;">
    file = spark.textFile(<span class="string">"hdfs://..."</span>)<br />
    &nbsp;<br />
    file.<span class="sparkop">flatMap</span>(<span class="closure">line =&gt; line.split(" ")</span>)<br />
    &nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">map</span>(<span class="closure">word =&gt; (word, 1)</span>)<br />
    &nbsp;&nbsp;&nbsp;&nbsp;.<span class="sparkop">reduceByKey</span>(<span class="closure">_ + _</span>)
  </div>
<div class="caption">Word Count implemented in Spark</div>

<div align="center" style="margin-top: 20px;">
    <img src="/images/spark-lr.png" alt="Logistic regression performance in Spark vs Hadoop" />
  </div>
<div class="caption">Logistic regression in Spark vs Hadoop</div>
<h2 style="text-align:center"><a href="/downloads"><img src="/images/download.png" alt="Download" style="vertical-align: middle" />&nbsp;&nbsp;Download Spark</a></h2>

      </div>
      
      <footer id="colophon" role="contentinfo">
  <div id="site-generator">
    <p style="padding-top: 0; padding-bottom: 15px;">
      Apache Spark is an effort undergoing incubation at The Apache Software Foundation.
      <a href="http://incubator.apache.org/" style="border: none;">
        <img style="vertical-align: middle; border: none;" src="/images/incubator-logo.png" alt="Apache Incubator" title="Apache Incubator" />
      </a>  
    </p>
  </div>
</footer><!-- #colophon -->

    </div><!-- #primary -->
  </div><!-- #main -->
</div><!-- #page -->


</body>
</html>