summaryrefslogtreecommitdiff
path: root/site/faq.html
blob: fa3a7513902c221cb397a21bb671b27d2f79ff12 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>
     FAQ | Apache Spark
    
  </title>

  

  <!-- Bootstrap core CSS -->
  <link href="/css/cerulean.min.css" rel="stylesheet">
  <link href="/css/custom.css" rel="stylesheet">

  <script type="text/javascript">
  // Google Analytics initialization: queue the account id and the first
  // page view on _gaq, then inject the ga.js loader ahead of the first
  // <script> element on the page.
  var _gaq = _gaq || [];
  _gaq.push(['_setAccount', 'UA-32518208-2']);
  _gaq.push(['_trackPageview']);
  (function() {
    var loader = document.createElement('script');
    loader.type = 'text/javascript';
    loader.async = true;

    // Pick the host prefix that matches the protocol this page was served over.
    var hostPrefix = ('https:' == document.location.protocol) ? 'https://ssl' : 'http://www';
    loader.src = hostPrefix + '.google-analytics.com/ga.js';

    var firstScript = document.getElementsByTagName('script')[0];
    firstScript.parentNode.insertBefore(loader, firstScript);
  })();

  // Records an outbound-link click as a Google Analytics event, then follows
  // the link after a 100 ms delay so the asynchronous report can go out first.
  //   link     - object whose href is the navigation target
  //   category - event category pushed to _gaq
  //   action   - event action pushed to _gaq
  function trackOutboundLink(link, category, action) {
    var followLink = function() {
      document.location.href = link.href;
    };

    try {
      _gaq.push(['_trackEvent', category, action]);
    } catch (ignored) {
      // Best effort: a tracking failure must not stop the navigation below.
    }

    setTimeout(followLink, 100);
  }
  </script>

  <!-- HTML5 shim and Respond.js IE8 support of HTML5 elements and media queries -->
  <!--[if lt IE 9]>
  <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
  <script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script>
  <![endif]-->
</head>

<body>

<div class="container" style="max-width: 1200px;">

<div class="masthead">
  
    <p class="lead">
      <a href="/">
      <img src="/images/spark-logo.png" alt="Apache Spark"
        style="height:100px; width:auto; vertical-align: bottom; margin-top: 20px;"></a><span class="tagline">
          Lightning-fast cluster computing
      </span>
    </p>
  
</div>

<nav class="navbar navbar-default" role="navigation">
  <!-- Brand and toggle get grouped for better mobile display -->
  <div class="navbar-header">
    <button type="button" class="navbar-toggle" data-toggle="collapse"
            data-target="#navbar-collapse-1">
      <span class="sr-only">Toggle navigation</span>
      <span class="icon-bar"></span>
      <span class="icon-bar"></span>
      <span class="icon-bar"></span>
    </button>
  </div>

  <!-- Collect the nav links, forms, and other content for toggling -->
  <div class="collapse navbar-collapse" id="navbar-collapse-1">
    <ul class="nav navbar-nav">
      <li><a href="/downloads.html">Download</a></li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Related Projects <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          
          <li><a href="http://shark.cs.berkeley.edu">Shark (SQL)</a></li>
          <li><a href="/streaming/">Spark Streaming</a></li>
          <li><a href="/mllib/">MLlib (machine learning)</a></li>
          <li><a href="http://amplab.github.io/graphx/">GraphX (graph)</a></li>
        </ul>
      </li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Documentation <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/documentation.html">Overview</a></li>
          <li><a href="/docs/latest/">Latest Release</a></li>
          <li><a href="/examples.html">Examples</a></li>
        </ul>
      </li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Community <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/community.html">Mailing Lists</a></li>
          <li><a href="/community.html#events">Events and Meetups</a></li>
          <li><a href="/community.html#history">Project History</a></li>
          <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Powered+By+Spark">Powered By</a></li>
        </ul>
      </li>
      <li><a href="/faq.html">FAQ</a></li>
    </ul>
  </div>
  <!-- /.navbar-collapse -->
</nav>


<div class="row">
  <div class="col-md-3 col-md-push-9">
    <div class="news" style="margin-bottom: 20px;">
      <h5>Latest News</h5>
      <ul class="list-unstyled">
        
          <li><a href="/news/spark-0-9-0-released.html">Spark 0.9.0 released</a>
          <span class="small">(Feb 02, 2014)</span></li>
        
          <li><a href="/news/spark-0-8-1-released.html">Spark 0.8.1 released</a>
          <span class="small">(Dec 19, 2013)</span></li>
        
          <li><a href="/news/spark-summit-2013-is-a-wrap.html">Spark Summit 2013 is a Wrap</a>
          <span class="small">(Dec 15, 2013)</span></li>
        
          <li><a href="/news/announcing-the-first-spark-summit.html">Announcing the first Spark Summit: December 2, 2013</a>
          <span class="small">(Oct 08, 2013)</span></li>
        
      </ul>
      <p class="small" style="text-align: right;"><a href="/news/index.html">Archive</a></p>
    </div>
    <div class="hidden-xs hidden-sm">
      <a href="/downloads.html" class="btn btn-success btn-lg btn-block" style="margin-bottom: 30px;">
        Download Spark
      </a>
      <p style="font-size: 16px; font-weight: 500; color: #555;">
        Related Projects:
      </p>
      <ul class="list-narrow">
        
        <li><a href="http://shark.cs.berkeley.edu">Shark (SQL)</a></li>
        <li><a href="/streaming/">Spark Streaming</a></li>
        <li><a href="/mllib/">MLlib (machine learning)</a></li>
        <li><a href="http://amplab.github.io/graphx/">GraphX (graph)</a></li>
      </ul>
    </div>
  </div>

  <div class="col-md-9 col-md-pull-3">
    <h2>Spark FAQ</h2>

<p class="question">How does Spark relate to Hadoop?</p>
<p class="answer">
Spark is a fast and powerful engine for processing Hadoop data.
It runs in Hadoop clusters through
<a href="http://hadoop.apache.org/docs/current2/hadoop-yarn/hadoop-yarn-site/YARN.html">Hadoop YARN</a> 
or Spark's <a href="/docs/latest/spark-standalone.html">standalone mode</a>, and it can process
data in HDFS, HBase, Cassandra, Hive, and any Hadoop InputFormat.
It is designed to perform both general data processing (similar to MapReduce) and new workloads like
streaming, interactive queries, and machine learning.
</p>

<p class="question">Which languages does Spark support?</p>
<p class="answer">Spark supports Scala, Java and Python.</p>

<p class="question">Does Spark require modified versions of Scala or Python?</p>
<p class="answer">No. Spark requires neither changes to Scala nor compiler plugins. The Python API uses the standard CPython implementation, and can call into existing C libraries for Python such as NumPy.</p>

<p class="question">What happens when a cached dataset does not fit in memory?</p>
<p class="answer">Spark can either spill it to disk or recompute the partitions that don't fit in RAM each time they are requested. By default, it uses recomputation, but you can set a dataset's <a href="/docs/latest/scala-programming-guide.html#rdd-persistence">storage level</a> to <code>MEMORY_AND_DISK</code> to avoid this.  </p>

<p class="question">How can I run Spark on a cluster?</p>
<p class="answer">You can use either the <a href="/docs/latest/spark-standalone.html">standalone deploy mode</a>, which only needs Java to be installed on each node, or the <a href="/docs/latest/running-on-mesos.html">Mesos</a> and <a href="/docs/latest/running-on-yarn.html">YARN</a> cluster managers. If you'd like to run on Amazon EC2, Spark provides <a href="/docs/latest/ec2-scripts.html">EC2 scripts</a> to automatically launch a cluster.</p>

<p>Note that you can also run Spark locally (possibly on multiple cores) without any special setup by just passing <code>local[N]</code> as the master URL, where <code>N</code> is the number of parallel threads you want.</p>

<p class="question">Do I need Hadoop to run Spark?</p>
<p class="answer">No, but if you run on a cluster, you will need some form of shared file system (for example, NFS mounted at the same path on each node). If you have this type of filesystem, you can just deploy Spark in standalone mode.</p>

<p class="question">How can I access data in S3?</p>
<p class="answer">Use the <code>s3n://</code> URI scheme (<code>s3n://bucket/path</code>). You will also need to set your Amazon security credentials, either by setting the environment variables <code>AWS_ACCESS_KEY_ID</code> and <code>AWS_SECRET_ACCESS_KEY</code> before your program runs, or by setting <code>fs.s3n.awsAccessKeyId</code> and <code>fs.s3n.awsSecretAccessKey</code> in <code>SparkContext.hadoopConfiguration</code>.</p>

<p class="question">What are good resources for learning Scala?</p>
<p class="answer">Check out <a href="http://www.artima.com/scalazine/articles/steps.html">First Steps to Scala</a> for a quick introduction, the <a href="http://www.scala-lang.org/docu/files/ScalaTutorial.pdf">Scala tutorial for Java programmers</a>, or the free online book <a href="http://www.artima.com/pins1ed/">Programming in Scala</a>. Scala is easy to transition to if you have Java experience or experience in a similarly high-level language (e.g. Ruby).</p>

<p>In addition, Spark also has <a href="/docs/latest/java-programming-guide.html">Java</a> and <a href="/docs/latest/python-programming-guide.html">Python</a> APIs.</p>

<p class="question">What license is Spark under?</p>

<p class="answer">Starting in version 0.8, Spark is under the <a href="http://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 license</a>. Previous versions used the <a href="https://github.com/mesos/spark/blob/branch-0.7/LICENSE">BSD license</a>.</p>

<p class="question">How can I contribute to Spark?</p>
<p class="answer">Contact the <a href="/community.html">mailing list</a> or send us a pull request on <a href="https://github.com/apache/incubator-spark">GitHub</a> (instructions <a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">here</a>).  We're glad to hear about your experience using Spark and to accept patches.</p>
<p>If you would like to report an issue, post it to the <a href="https://spark-project.atlassian.net/browse/SPARK">Spark issue tracker</a>.</p>

<p class="question">Where can I get more help?</p>
<p class="answer">Please post on the <a href="http://apache-spark-user-list.1001560.n3.nabble.com">Spark Users</a> mailing list.  We'll be glad to help!</p>

  </div>
</div>



<footer class="small">
  <hr>
  Apache Spark is an effort undergoing incubation at The Apache Software Foundation.
  <a href="http://incubator.apache.org/" style="border: none;">
    <img style="vertical-align: middle; float: right; margin-bottom: 15px;"
        src="/images/incubator-logo.png" alt="Apache Incubator" title="Apache Incubator" />
  </a>  
</footer>

</div>

<script src="https://code.jquery.com/jquery.js"></script>
<script src="//netdna.bootstrapcdn.com/bootstrap/3.0.3/js/bootstrap.min.js"></script>
<script src="/js/lang-tabs.js"></script>

</body>
</html>