author    Patrick Wendell <pwendell@apache.org>  2015-04-17 05:52:53 +0000
committer Patrick Wendell <pwendell@apache.org>  2015-04-17 05:52:53 +0000
commit    d49935af658ee525496c7ac598f7352c7793d6f8 (patch)
tree      1813e84df9974a07d595ba24f0fe0e832f182f05 /site/docs/1.3.1/configuration.html
parent    9695c4519404b689f29bc0fbc1a2eefa3eb33806 (diff)
Adding docs for 1.2.2 and 1.3.1
Diffstat (limited to 'site/docs/1.3.1/configuration.html')
-rw-r--r--  site/docs/1.3.1/configuration.html  1641
1 file changed, 1641 insertions(+), 0 deletions(-)
diff --git a/site/docs/1.3.1/configuration.html b/site/docs/1.3.1/configuration.html
new file mode 100644
index 000000000..43fcd2caf
--- /dev/null
+++ b/site/docs/1.3.1/configuration.html
@@ -0,0 +1,1641 @@
+<!DOCTYPE html>
+<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
+<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
+<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
+ <head>
+ <meta charset="utf-8">
+ <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
+ <title>Configuration - Spark 1.3.1 Documentation</title>
+
+
+
+
+ <link rel="stylesheet" href="css/bootstrap.min.css">
+ <style>
+ body {
+ padding-top: 60px;
+ padding-bottom: 40px;
+ }
+ </style>
+ <meta name="viewport" content="width=device-width">
+ <link rel="stylesheet" href="css/bootstrap-responsive.min.css">
+ <link rel="stylesheet" href="css/main.css">
+
+ <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
+
+ <link rel="stylesheet" href="css/pygments-default.css">
+
+
+ <!-- Google analytics script -->
+ <script type="text/javascript">
+ var _gaq = _gaq || [];
+ _gaq.push(['_setAccount', 'UA-32518208-2']);
+ _gaq.push(['_trackPageview']);
+
+ (function() {
+ var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
+ ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
+ })();
+ </script>
+
+
+ </head>
+ <body>
+ <!--[if lt IE 7]>
+ <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
+ <![endif]-->
+
+ <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
+
+ <div class="navbar navbar-fixed-top" id="topbar">
+ <div class="navbar-inner">
+ <div class="container">
+ <div class="brand"><a href="index.html">
+ <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">1.3.1</span>
+ </div>
+ <ul class="nav">
+ <!--TODO(andyk): Add class="active" attribute to li some how.-->
+ <li><a href="index.html">Overview</a></li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="quick-start.html">Quick Start</a></li>
+ <li><a href="programming-guide.html">Spark Programming Guide</a></li>
+ <li class="divider"></li>
+ <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
+ <li><a href="sql-programming-guide.html">DataFrames and SQL</a></li>
+ <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
+ <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
+ <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li>
+ <li><a href="api/java/index.html">Java</a></li>
+ <li><a href="api/python/index.html">Python</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="cluster-overview.html">Overview</a></li>
+ <li><a href="submitting-applications.html">Submitting Applications</a></li>
+ <li class="divider"></li>
+ <li><a href="spark-standalone.html">Spark Standalone</a></li>
+ <li><a href="running-on-mesos.html">Mesos</a></li>
+ <li><a href="running-on-yarn.html">YARN</a></li>
+ <li class="divider"></li>
+ <li><a href="ec2-scripts.html">Amazon EC2</a></li>
+ </ul>
+ </li>
+
+ <li class="dropdown">
+ <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
+ <ul class="dropdown-menu">
+ <li><a href="configuration.html">Configuration</a></li>
+ <li><a href="monitoring.html">Monitoring</a></li>
+ <li><a href="tuning.html">Tuning Guide</a></li>
+ <li><a href="job-scheduling.html">Job Scheduling</a></li>
+ <li><a href="security.html">Security</a></li>
+ <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
+ <li><a href="hadoop-third-party-distributions.html">3<sup>rd</sup>-Party Hadoop Distros</a></li>
+ <li class="divider"></li>
+ <li><a href="building-spark.html">Building Spark</a></li>
+ <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
+ <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects">Supplemental Projects</a></li>
+ </ul>
+ </li>
+ </ul>
+ <!--<p class="navbar-text pull-right"><span class="version-text">v1.3.1</span></p>-->
+ </div>
+ </div>
+ </div>
+
+ <div class="container" id="content">
+
+ <h1 class="title">Spark Configuration</h1>
+
+
+ <ul id="markdown-toc">
+ <li><a href="#spark-properties">Spark Properties</a> <ul>
+ <li><a href="#dynamically-loading-spark-properties">Dynamically Loading Spark Properties</a></li>
+ <li><a href="#viewing-spark-properties">Viewing Spark Properties</a></li>
+ <li><a href="#available-properties">Available Properties</a> <ul>
+ <li><a href="#application-properties">Application Properties</a></li>
+ <li><a href="#runtime-environment">Runtime Environment</a></li>
+ <li><a href="#shuffle-behavior">Shuffle Behavior</a></li>
+ <li><a href="#spark-ui">Spark UI</a></li>
+ <li><a href="#compression-and-serialization">Compression and Serialization</a></li>
+ <li><a href="#execution-behavior">Execution Behavior</a></li>
+ <li><a href="#networking">Networking</a></li>
+ <li><a href="#scheduling">Scheduling</a></li>
+ <li><a href="#dynamic-allocation">Dynamic Allocation</a></li>
+ <li><a href="#security">Security</a></li>
+ <li><a href="#encryption">Encryption</a></li>
+ <li><a href="#spark-streaming">Spark Streaming</a></li>
+ <li><a href="#cluster-managers">Cluster Managers</a></li>
+ </ul>
+ </li>
+ </ul>
+ </li>
+ <li><a href="#environment-variables">Environment Variables</a></li>
+ <li><a href="#configuring-logging">Configuring Logging</a></li>
+ <li><a href="#overriding-configuration-directory">Overriding configuration directory</a></li>
+</ul>
+
+<p>Spark provides three locations to configure the system:</p>
+
+<ul>
+ <li><a href="#spark-properties">Spark properties</a> control most application parameters and can be set by using
+a <a href="api/scala/index.html#org.apache.spark.SparkConf">SparkConf</a> object, or through Java
+system properties.</li>
+ <li><a href="#environment-variables">Environment variables</a> can be used to set per-machine settings, such as
+the IP address, through the <code>conf/spark-env.sh</code> script on each node.</li>
+ <li><a href="#configuring-logging">Logging</a> can be configured through <code>log4j.properties</code>.</li>
+</ul>
+
+<h1 id="spark-properties">Spark Properties</h1>
+
+<p>Spark properties control most application settings and are configured separately for each
+application. These properties can be set directly on a
+<a href="api/scala/index.html#org.apache.spark.SparkConf">SparkConf</a> passed to your
+<code>SparkContext</code>. <code>SparkConf</code> allows you to configure some of the common properties
+(e.g. master URL and application name), as well as arbitrary key-value pairs through the
+<code>set()</code> method. For example, we could initialize an application with two threads as follows:</p>
+
+<p>Note that we run with <code>local[2]</code>, meaning two threads, which represents &#8220;minimal&#8221; parallelism
+and can help detect bugs that only exist when we run in a distributed context.</p>
+
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">conf</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">SparkConf</span><span class="o">()</span>
+ <span class="o">.</span><span class="n">setMaster</span><span class="o">(</span><span class="s">&quot;local[2]&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">setAppName</span><span class="o">(</span><span class="s">&quot;CountingSheep&quot;</span><span class="o">)</span>
+ <span class="o">.</span><span class="n">set</span><span class="o">(</span><span class="s">&quot;spark.executor.memory&quot;</span><span class="o">,</span> <span class="s">&quot;1g&quot;</span><span class="o">)</span>
+<span class="k">val</span> <span class="n">sc</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">SparkContext</span><span class="o">(</span><span class="n">conf</span><span class="o">)</span></code></pre></div>
+
+<p>Note that we can have more than 1 thread in local mode, and in cases like Spark Streaming, we may actually
+require more than one thread to prevent any sort of starvation issues.</p>
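+
+<p>For illustration, a minimal Spark Streaming sketch run with <code>local[2]</code>: the receiver occupies one
+thread, leaving the second free to process the received data. The host and port below are placeholders.</p>
+
+<pre><code>import org.apache.spark.SparkConf
+import org.apache.spark.streaming.{Seconds, StreamingContext}
+
+// local[2]: one thread for the socket receiver, one for processing
+val conf = new SparkConf().setMaster("local[2]").setAppName("StreamingSketch")
+val ssc = new StreamingContext(conf, Seconds(1))
+
+val lines = ssc.socketTextStream("localhost", 9999)  // placeholder source
+lines.count().print()
+
+ssc.start()
+ssc.awaitTermination()
+</code></pre>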
+
+<h2 id="dynamically-loading-spark-properties">Dynamically Loading Spark Properties</h2>
+<p>In some cases, you may want to avoid hard-coding certain configurations in a <code>SparkConf</code>, for
+instance if you&#8217;d like to run the same application with different masters or different
+amounts of memory. In that case, Spark allows you to simply create an empty conf:</p>
+
+<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">sc</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">SparkContext</span><span class="o">(</span><span class="k">new</span> <span class="nc">SparkConf</span><span class="o">())</span></code></pre></div>
+
+<p>Then, you can supply configuration values at runtime:</p>
+
+<div class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/spark-submit --name <span class="s2">&quot;My app&quot;</span> --master <span class="nb">local</span><span class="o">[</span>4<span class="o">]</span> --conf spark.shuffle.spill<span class="o">=</span><span class="nb">false</span>
+ --conf <span class="s2">&quot;spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps&quot;</span> myApp.jar</code></pre></div>
+
+<p>The Spark shell and <a href="submitting-applications.html"><code>spark-submit</code></a>
+tool support two ways to load configurations dynamically. The first are command line options,
+such as <code>--master</code>, as shown above. <code>spark-submit</code> can accept any Spark property using the <code>--conf</code>
+flag, but uses special flags for properties that play a part in launching the Spark application.
+Running <code>./bin/spark-submit --help</code> will show the entire list of these options.</p>
+
+<p><code>bin/spark-submit</code> will also read configuration options from <code>conf/spark-defaults.conf</code>, in which
+each line consists of a key and a value separated by whitespace. For example:</p>
+
+<pre><code>spark.master spark://5.6.7.8:7077
+spark.executor.memory 512m
+spark.eventLog.enabled true
+spark.serializer org.apache.spark.serializer.KryoSerializer
+</code></pre>
+
+<p>Any values specified as flags or in the properties file will be passed on to the application
+and merged with those specified through SparkConf. Properties set directly on the SparkConf
+take highest precedence, then flags passed to <code>spark-submit</code> or <code>spark-shell</code>, then options
+in the <code>spark-defaults.conf</code> file. A few configuration keys have been renamed since earlier
+versions of Spark; in such cases, the older key names are still accepted, but take lower
+precedence than any instance of the newer key.</p>
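+
+<p>For illustration, a minimal sketch of this precedence (the memory values are placeholders): a property set on the
+<code>SparkConf</code> wins over the same property passed as a flag or listed in <code>spark-defaults.conf</code>.</p>
+
+<pre><code>import org.apache.spark.{SparkConf, SparkContext}
+
+// spark-defaults.conf (lowest precedence):  spark.executor.memory 512m
+// spark-submit flag (middle precedence):    --conf spark.executor.memory=1g
+// SparkConf in the application (highest precedence):
+val conf = new SparkConf().set("spark.executor.memory", "2g")
+val sc = new SparkContext(conf)
+
+println(sc.getConf.get("spark.executor.memory"))  // prints 2g
+</code></pre>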
+
+<h2 id="viewing-spark-properties">Viewing Spark Properties</h2>
+
+<p>The application web UI at <code>http://&lt;driver&gt;:4040</code> lists Spark properties in the &#8220;Environment&#8221; tab.
+This is a useful place to check to make sure that your properties have been set correctly. Note
+that only values explicitly specified through <code>spark-defaults.conf</code>, <code>SparkConf</code>, or the command
+line will appear. For all other configuration properties, you can assume the default value is used.</p>
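+
+<p>If you prefer to check from code rather than the UI, one possible sketch is to read the explicitly-set values
+back from the active <code>SparkContext</code> (as in the UI, default values are not listed):</p>
+
+<pre><code>// sc is an existing SparkContext
+sc.getConf.getAll.sorted.foreach { case (key, value) => println(key + " = " + value) }
+</code></pre>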
+
+<h2 id="available-properties">Available Properties</h2>
+
+<p>Most of the properties that control internal settings have reasonable default values. Some
+of the most common options to set are:</p>
+
+<h4 id="application-properties">Application Properties</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.app.name</code></td>
+ <td>(none)</td>
+ <td>
+ The name of your application. This will appear in the UI and in log data.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.driver.cores</code></td>
+ <td>1</td>
+ <td>
+ Number of cores to use for the driver process, only in cluster mode.
+ </td>
+</tr>
+<tr>
+  <td><code>spark.driver.maxResultSize</code></td>
+  <td>1g</td>
+  <td>
+    Limit of total size of serialized results of all partitions for each Spark action (e.g. collect).
+    Should be at least 1M, or 0 for unlimited. Jobs will be aborted if the total size
+    is above this limit.
+    Having a high limit may cause out-of-memory errors in the driver (this depends on spark.driver.memory
+    and the memory overhead of objects in the JVM). Setting a proper limit can protect the driver from
+    out-of-memory errors.
+  </td>
+</tr>
+<tr>
+ <td><code>spark.driver.memory</code></td>
+ <td>512m</td>
+ <td>
+ Amount of memory to use for the driver process, i.e. where SparkContext is initialized.
+ (e.g. <code>512m</code>, <code>2g</code>).
+
+ <br /><em>Note:</em> In client mode, this config must not be set through the <code>SparkConf</code>
+ directly in your application, because the driver JVM has already started at that point.
+ Instead, please set this through the <code>--driver-memory</code> command line option
+ or in your default properties file.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.memory</code></td>
+ <td>512m</td>
+ <td>
+ Amount of memory to use per executor process, in the same format as JVM memory strings
+ (e.g. <code>512m</code>, <code>2g</code>).
+ </td>
+</tr>
+<tr>
+ <td><code>spark.extraListeners</code></td>
+ <td>(none)</td>
+ <td>
+ A comma-separated list of classes that implement <code>SparkListener</code>; when initializing
+ SparkContext, instances of these classes will be created and registered with Spark's listener
+ bus. If a class has a single-argument constructor that accepts a SparkConf, that constructor
+ will be called; otherwise, a zero-argument constructor will be called. If no valid constructor
+ can be found, the SparkContext creation will fail with an exception.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.local.dir</code></td>
+ <td>/tmp</td>
+ <td>
+ Directory to use for "scratch" space in Spark, including map output files and RDDs that get
+ stored on disk. This should be on a fast, local disk in your system. It can also be a
+ comma-separated list of multiple directories on different disks.
+
+    NOTE: In Spark 1.0 and later this will be overridden by SPARK_LOCAL_DIRS (Standalone, Mesos) or
+ LOCAL_DIRS (YARN) environment variables set by the cluster manager.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.logConf</code></td>
+ <td>false</td>
+ <td>
+ Logs the effective SparkConf as INFO when a SparkContext is started.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.master</code></td>
+ <td>(none)</td>
+ <td>
+ The cluster manager to connect to. See the list of
+ <a href="submitting-applications.html#master-urls"> allowed master URL's</a>.
+ </td>
+</tr>
+</table>
+
+<p>Apart from these, the following properties are also available, and may be useful in some situations:</p>
+
+<h4 id="runtime-environment">Runtime Environment</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.driver.extraClassPath</code></td>
+ <td>(none)</td>
+ <td>
+ Extra classpath entries to append to the classpath of the driver.
+
+ <br /><em>Note:</em> In client mode, this config must not be set through the <code>SparkConf</code>
+ directly in your application, because the driver JVM has already started at that point.
+ Instead, please set this through the <code>--driver-class-path</code> command line option or in
+ your default properties file.</td>
+
+</tr>
+<tr>
+ <td><code>spark.driver.extraJavaOptions</code></td>
+ <td>(none)</td>
+ <td>
+ A string of extra JVM options to pass to the driver. For instance, GC settings or other logging.
+
+ <br /><em>Note:</em> In client mode, this config must not be set through the <code>SparkConf</code>
+ directly in your application, because the driver JVM has already started at that point.
+ Instead, please set this through the <code>--driver-java-options</code> command line option or in
+ your default properties file.</td>
+
+</tr>
+<tr>
+ <td><code>spark.driver.extraLibraryPath</code></td>
+ <td>(none)</td>
+ <td>
+ Set a special library path to use when launching the driver JVM.
+
+ <br /><em>Note:</em> In client mode, this config must not be set through the <code>SparkConf</code>
+ directly in your application, because the driver JVM has already started at that point.
+ Instead, please set this through the <code>--driver-library-path</code> command line option or in
+ your default properties file.</td>
+
+</tr>
+<tr>
+ <td><code>spark.driver.userClassPathFirst</code></td>
+ <td>false</td>
+ <td>
+ (Experimental) Whether to give user-added jars precedence over Spark's own jars when loading
+    classes in the driver. This feature can be used to mitigate conflicts between Spark's
+ dependencies and user dependencies. It is currently an experimental feature.
+
+ This is used in cluster mode only.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.extraClassPath</code></td>
+ <td>(none)</td>
+ <td>
+ Extra classpath entries to append to the classpath of executors. This exists primarily for
+ backwards-compatibility with older versions of Spark. Users typically should not need to set
+ this option.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.extraJavaOptions</code></td>
+ <td>(none)</td>
+ <td>
+ A string of extra JVM options to pass to executors. For instance, GC settings or other logging.
+ Note that it is illegal to set Spark properties or heap size settings with this option. Spark
+ properties should be set using a SparkConf object or the spark-defaults.conf file used with the
+ spark-submit script. Heap size settings can be set with spark.executor.memory.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.extraLibraryPath</code></td>
+ <td>(none)</td>
+ <td>
+    Set a special library path to use when launching executor JVMs.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.logs.rolling.maxRetainedFiles</code></td>
+ <td>(none)</td>
+ <td>
+ Sets the number of latest rolling log files that are going to be retained by the system.
+ Older log files will be deleted. Disabled by default.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.logs.rolling.size.maxBytes</code></td>
+ <td>(none)</td>
+ <td>
+ Set the max size of the file by which the executor logs will be rolled over.
+ Rolling is disabled by default. Value is set in terms of bytes.
+ See <code>spark.executor.logs.rolling.maxRetainedFiles</code>
+ for automatic cleaning of old logs.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.logs.rolling.strategy</code></td>
+ <td>(none)</td>
+ <td>
+    Set the strategy for rolling executor logs. By default it is disabled. It can
+ be set to "time" (time-based rolling) or "size" (size-based rolling). For "time",
+ use <code>spark.executor.logs.rolling.time.interval</code> to set the rolling interval.
+ For "size", use <code>spark.executor.logs.rolling.size.maxBytes</code> to set
+ the maximum file size for rolling.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.logs.rolling.time.interval</code></td>
+ <td>daily</td>
+ <td>
+ Set the time interval by which the executor logs will be rolled over.
+ Rolling is disabled by default. Valid values are `daily`, `hourly`, `minutely` or
+ any interval in seconds. See <code>spark.executor.logs.rolling.maxRetainedFiles</code>
+ for automatic cleaning of old logs.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.userClassPathFirst</code></td>
+ <td>false</td>
+ <td>
+ (Experimental) Same functionality as <code>spark.driver.userClassPathFirst</code>, but
+ applied to executor instances.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executorEnv.[EnvironmentVariableName]</code></td>
+ <td>(none)</td>
+ <td>
+ Add the environment variable specified by <code>EnvironmentVariableName</code> to the Executor
+ process. The user can specify multiple of these to set multiple environment variables.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.python.profile</code></td>
+ <td>false</td>
+ <td>
+    Enable profiling in Python workers. The profile result will be shown by `sc.show_profiles()`,
+    or it will be displayed before the driver exits. It can also be dumped to disk with
+    `sc.dump_profiles(path)`. If some of the profile results have been displayed manually,
+    they will not be displayed automatically before the driver exits.
+
+ By default the `pyspark.profiler.BasicProfiler` will be used, but this can be overridden by
+ passing a profiler class in as a parameter to the `SparkContext` constructor.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.python.profile.dump</code></td>
+ <td>(none)</td>
+ <td>
+    The directory used to dump the profile result before the driver exits.
+    The results will be dumped as a separate file for each RDD. They can be loaded
+    with pstats.Stats(). If this is specified, the profile result will not be displayed
+    automatically.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.python.worker.memory</code></td>
+ <td>512m</td>
+ <td>
+    Amount of memory to use per Python worker process during aggregation, in the same
+    format as JVM memory strings (e.g. <code>512m</code>, <code>2g</code>). If the memory
+    used during aggregation goes above this amount, it will spill the data to disk.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.python.worker.reuse</code></td>
+ <td>true</td>
+ <td>
+    Whether to reuse Python workers. If yes, Spark will use a fixed number of Python workers and
+    does not need to fork() a Python process for every task. This is very useful when there is a
+    large broadcast, because the broadcast will not need to be transferred from the JVM to a
+    Python worker for every task.
+ </td>
+</tr>
+</table>
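+
+<p>For illustration, a sketch combining a few of these runtime-environment properties (the environment variable,
+path and option values are placeholders, not recommendations):</p>
+
+<pre><code>import org.apache.spark.SparkConf
+
+val conf = new SparkConf()
+  .set("spark.executorEnv.DATA_DIR", "/mnt/data")  // placeholder variable and path
+  .set("spark.executor.extraJavaOptions", "-XX:+PrintGCDetails -XX:+PrintGCTimeStamps")
+  .set("spark.executor.logs.rolling.strategy", "time")
+  .set("spark.executor.logs.rolling.time.interval", "hourly")
+  .set("spark.executor.logs.rolling.maxRetainedFiles", "24")
+</code></pre>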
+
+<h4 id="shuffle-behavior">Shuffle Behavior</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.reducer.maxMbInFlight</code></td>
+ <td>48</td>
+ <td>
+ Maximum size (in megabytes) of map outputs to fetch simultaneously from each reduce task. Since
+ each output requires us to create a buffer to receive it, this represents a fixed memory
+ overhead per reduce task, so keep it small unless you have a large amount of memory.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.blockTransferService</code></td>
+ <td>netty</td>
+ <td>
+ Implementation to use for transferring shuffle and cached blocks between executors. There
+ are two implementations available: <code>netty</code> and <code>nio</code>. Netty-based
+ block transfer is intended to be simpler but equally efficient and is the default option
+ starting in 1.2.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.compress</code></td>
+ <td>true</td>
+ <td>
+ Whether to compress map output files. Generally a good idea. Compression will use
+ <code>spark.io.compression.codec</code>.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.consolidateFiles</code></td>
+ <td>false</td>
+ <td>
+ If set to "true", consolidates intermediate files created during a shuffle. Creating fewer
+ files can improve filesystem performance for shuffles with large numbers of reduce tasks. It
+ is recommended to set this to "true" when using ext4 or xfs filesystems. On ext3, this option
+ might degrade performance on machines with many (&gt;8) cores due to filesystem limitations.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.file.buffer.kb</code></td>
+ <td>32</td>
+ <td>
+ Size of the in-memory buffer for each shuffle file output stream, in kilobytes. These buffers
+ reduce the number of disk seeks and system calls made in creating intermediate shuffle files.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.io.maxRetries</code></td>
+ <td>3</td>
+ <td>
+ (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is
+ set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC
+ pauses or transient network connectivity issues.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.io.numConnectionsPerPeer</code></td>
+ <td>1</td>
+ <td>
+ (Netty only) Connections between hosts are reused in order to reduce connection buildup for
+ large clusters. For clusters with many hard disks and few hosts, this may result in insufficient
+ concurrency to saturate all disks, and so users may consider increasing this value.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.io.preferDirectBufs</code></td>
+ <td>true</td>
+ <td>
+ (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache
+ block transfer. For environments where off-heap memory is tightly limited, users may wish to
+ turn this off to force all allocations from Netty to be on-heap.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.io.retryWait</code></td>
+ <td>5</td>
+ <td>
+ (Netty only) Seconds to wait between retries of fetches. The maximum delay caused by retrying
+ is simply <code>maxRetries * retryWait</code>, by default 15 seconds.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.manager</code></td>
+ <td>sort</td>
+ <td>
+ Implementation to use for shuffling data. There are two implementations available:
+ <code>sort</code> and <code>hash</code>. Sort-based shuffle is more memory-efficient and is
+ the default option starting in 1.2.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.memoryFraction</code></td>
+ <td>0.2</td>
+ <td>
+ Fraction of Java heap to use for aggregation and cogroups during shuffles, if
+ <code>spark.shuffle.spill</code> is true. At any given time, the collective size of
+ all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will
+    begin to spill to disk. If spills happen often, consider increasing this value at the expense of
+ <code>spark.storage.memoryFraction</code>.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.sort.bypassMergeThreshold</code></td>
+ <td>200</td>
+ <td>
+ (Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no
+ map-side aggregation and there are at most this many reduce partitions.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.spill</code></td>
+ <td>true</td>
+ <td>
+ If set to "true", limits the amount of memory used during reduces by spilling data out to disk.
+ This spilling threshold is specified by <code>spark.shuffle.memoryFraction</code>.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.shuffle.spill.compress</code></td>
+ <td>true</td>
+ <td>
+ Whether to compress data spilled during shuffles. Compression will use
+ <code>spark.io.compression.codec</code>.
+ </td>
+</tr>
+</table>
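+
+<p>For illustration, a sketch of a few shuffle-related settings together (the values are placeholders, not tuning
+advice):</p>
+
+<pre><code>import org.apache.spark.SparkConf
+
+val conf = new SparkConf()
+  .set("spark.shuffle.consolidateFiles", "true")   // often helpful on ext4 or xfs
+  .set("spark.shuffle.memoryFraction", "0.3")      // more room for shuffle maps before spilling
+  .set("spark.shuffle.file.buffer.kb", "64")       // larger in-memory buffer per output stream
+</code></pre>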
+
+<h4 id="spark-ui">Spark UI</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.eventLog.compress</code></td>
+ <td>false</td>
+ <td>
+ Whether to compress logged events, if <code>spark.eventLog.enabled</code> is true.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.eventLog.dir</code></td>
+ <td>file:///tmp/spark-events</td>
+ <td>
+ Base directory in which Spark events are logged, if <code>spark.eventLog.enabled</code> is true.
+ Within this base directory, Spark creates a sub-directory for each application, and logs the
+ events specific to the application in this directory. Users may want to set this to
+ a unified location like an HDFS directory so history files can be read by the history server.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.eventLog.enabled</code></td>
+ <td>false</td>
+ <td>
+ Whether to log Spark events, useful for reconstructing the Web UI after the application has
+ finished.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.ui.killEnabled</code></td>
+ <td>true</td>
+ <td>
+ Allows stages and corresponding jobs to be killed from the web ui.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.ui.port</code></td>
+ <td>4040</td>
+ <td>
+ Port for your application's dashboard, which shows memory and workload data.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.ui.retainedJobs</code></td>
+ <td>1000</td>
+ <td>
+ How many jobs the Spark UI and status APIs remember before garbage
+ collecting.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.ui.retainedStages</code></td>
+ <td>1000</td>
+ <td>
+ How many stages the Spark UI and status APIs remember before garbage
+ collecting.
+ </td>
+</tr>
+</table>
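+
+<p>For illustration, a sketch that turns on event logging so the web UI can be reconstructed after the application
+finishes (the HDFS location is a placeholder):</p>
+
+<pre><code>import org.apache.spark.SparkConf
+
+val conf = new SparkConf()
+  .set("spark.eventLog.enabled", "true")
+  .set("spark.eventLog.dir", "hdfs://namenode:8020/spark-events")  // placeholder location
+  .set("spark.eventLog.compress", "true")
+  .set("spark.ui.retainedStages", "500")
+</code></pre>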
+
+<h4 id="compression-and-serialization">Compression and Serialization</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.broadcast.compress</code></td>
+ <td>true</td>
+ <td>
+ Whether to compress broadcast variables before sending them. Generally a good idea.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.closure.serializer</code></td>
+ <td>org.apache.spark.serializer.<br />JavaSerializer</td>
+ <td>
+ Serializer class to use for closures. Currently only the Java serializer is supported.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.io.compression.codec</code></td>
+ <td>snappy</td>
+ <td>
+ The codec used to compress internal data such as RDD partitions, broadcast variables and
+ shuffle outputs. By default, Spark provides three codecs: <code>lz4</code>, <code>lzf</code>,
+ and <code>snappy</code>. You can also use fully qualified class names to specify the codec,
+ e.g.
+ <code>org.apache.spark.io.LZ4CompressionCodec</code>,
+ <code>org.apache.spark.io.LZFCompressionCodec</code>,
+ and <code>org.apache.spark.io.SnappyCompressionCodec</code>.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.io.compression.lz4.block.size</code></td>
+ <td>32768</td>
+ <td>
+ Block size (in bytes) used in LZ4 compression, in the case when LZ4 compression codec
+ is used. Lowering this block size will also lower shuffle memory usage when LZ4 is used.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.io.compression.snappy.block.size</code></td>
+ <td>32768</td>
+ <td>
+ Block size (in bytes) used in Snappy compression, in the case when Snappy compression codec
+ is used. Lowering this block size will also lower shuffle memory usage when Snappy is used.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.kryo.classesToRegister</code></td>
+ <td>(none)</td>
+ <td>
+ If you use Kryo serialization, give a comma-separated list of custom class names to register
+ with Kryo.
+ See the <a href="tuning.html#data-serialization">tuning guide</a> for more details.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.kryo.referenceTracking</code></td>
+ <td>true</td>
+ <td>
+ Whether to track references to the same object when serializing data with Kryo, which is
+ necessary if your object graphs have loops and useful for efficiency if they contain multiple
+ copies of the same object. Can be disabled to improve performance if you know this is not the
+ case.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.kryo.registrationRequired</code></td>
+ <td>false</td>
+ <td>
+ Whether to require registration with Kryo. If set to 'true', Kryo will throw an exception
+ if an unregistered class is serialized. If set to false (the default), Kryo will write
+ unregistered class names along with each object. Writing class names can cause
+ significant performance overhead, so enabling this option can enforce strictly that a
+ user has not omitted classes from registration.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.kryo.registrator</code></td>
+ <td>(none)</td>
+ <td>
+ If you use Kryo serialization, set this class to register your custom classes with Kryo. This
+ property is useful if you need to register your classes in a custom way, e.g. to specify a custom
+ field serializer. Otherwise <code>spark.kryo.classesToRegister</code> is simpler. It should be
+ set to a class that extends
+ <a href="api/scala/index.html#org.apache.spark.serializer.KryoRegistrator">
+ <code>KryoRegistrator</code></a>.
+ See the <a href="tuning.html#data-serialization">tuning guide</a> for more details.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.kryoserializer.buffer.max.mb</code></td>
+ <td>64</td>
+ <td>
+ Maximum allowable size of Kryo serialization buffer, in megabytes. This must be larger than any
+ object you attempt to serialize. Increase this if you get a "buffer limit exceeded" exception
+ inside Kryo.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.kryoserializer.buffer.mb</code></td>
+ <td>0.064</td>
+ <td>
+ Initial size of Kryo's serialization buffer, in megabytes. Note that there will be one buffer
+ <i>per core</i> on each worker. This buffer will grow up to
+ <code>spark.kryoserializer.buffer.max.mb</code> if needed.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.rdd.compress</code></td>
+ <td>false</td>
+ <td>
+ Whether to compress serialized RDD partitions (e.g. for
+ <code>StorageLevel.MEMORY_ONLY_SER</code>). Can save substantial space at the cost of some
+ extra CPU time.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.serializer</code></td>
+ <td>org.apache.spark.serializer.<br />JavaSerializer</td>
+ <td>
+ Class to use for serializing objects that will be sent over the network or need to be cached
+ in serialized form. The default of Java serialization works with any Serializable Java object
+ but is quite slow, so we recommend <a href="tuning.html">using
+ <code>org.apache.spark.serializer.KryoSerializer</code> and configuring Kryo serialization</a>
+ when speed is necessary. Can be any subclass of
+ <a href="api/scala/index.html#org.apache.spark.serializer.Serializer">
+    <code>org.apache.spark.serializer.Serializer</code></a>.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.serializer.objectStreamReset</code></td>
+ <td>100</td>
+ <td>
+    When serializing using org.apache.spark.serializer.JavaSerializer, the serializer caches
+    objects to prevent writing redundant data; however, that stops garbage collection of those
+    objects. By calling 'reset' you flush that info from the serializer, allowing old
+    objects to be collected. To turn off this periodic reset, set it to -1.
+    By default it will reset the serializer every 100 objects.
+ </td>
+</tr>
+</table>
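+
+<p>For illustration, a sketch that switches to Kryo and registers application classes programmatically via
+<code>SparkConf.registerKryoClasses</code>, the programmatic counterpart of
+<code>spark.kryo.classesToRegister</code>; the case class and buffer size are placeholders:</p>
+
+<pre><code>import org.apache.spark.{SparkConf, SparkContext}
+
+case class Record(id: Long, name: String)  // stand-in for an application class
+
+val conf = new SparkConf()
+  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+  .set("spark.kryoserializer.buffer.max.mb", "128")  // raise if you hit "buffer limit exceeded"
+  .registerKryoClasses(Array(classOf[Record]))
+val sc = new SparkContext(conf)
+</code></pre>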
+
+<h4 id="execution-behavior">Execution Behavior</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.broadcast.blockSize</code></td>
+ <td>4096</td>
+ <td>
+ Size of each piece of a block in kilobytes for <code>TorrentBroadcastFactory</code>.
+ Too large a value decreases parallelism during broadcast (makes it slower); however, if it is
+ too small, <code>BlockManager</code> might take a performance hit.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.broadcast.factory</code></td>
+ <td>org.apache.spark.broadcast.<br />TorrentBroadcastFactory</td>
+ <td>
+ Which broadcast implementation to use.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.cleaner.ttl</code></td>
+ <td>(infinite)</td>
+ <td>
+ Duration (seconds) of how long Spark will remember any metadata (stages generated, tasks
+ generated, etc.). Periodic cleanups will ensure that metadata older than this duration will be
+ forgotten. This is useful for running Spark for many hours / days (for example, running 24/7 in
+ case of Spark Streaming applications). Note that any RDD that persists in memory for more than
+ this duration will be cleared as well.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.default.parallelism</code></td>
+ <td>
+ For distributed shuffle operations like <code>reduceByKey</code> and <code>join</code>, the
+ largest number of partitions in a parent RDD. For operations like <code>parallelize</code>
+ with no parent RDDs, it depends on the cluster manager:
+ <ul>
+ <li>Local mode: number of cores on the local machine</li>
+ <li>Mesos fine grained mode: 8</li>
+ <li>Others: total number of cores on all executor nodes or 2, whichever is larger</li>
+ </ul>
+ </td>
+ <td>
+ Default number of partitions in RDDs returned by transformations like <code>join</code>,
+ <code>reduceByKey</code>, and <code>parallelize</code> when not set by user.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.heartbeatInterval</code></td>
+ <td>10000</td>
+ <td>Interval (milliseconds) between each executor's heartbeats to the driver. Heartbeats let
+ the driver know that the executor is still alive and update it with metrics for in-progress
+ tasks.</td>
+</tr>
+<tr>
+ <td><code>spark.files.fetchTimeout</code></td>
+ <td>60</td>
+ <td>
+ Communication timeout to use when fetching files added through SparkContext.addFile() from
+ the driver, in seconds.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.files.useFetchCache</code></td>
+ <td>true</td>
+ <td>
+ If set to true (default), file fetching will use a local cache that is shared by executors
+ that belong to the same application, which can improve task launching performance when
+ running many executors on the same host. If set to false, these caching optimizations will
+ be disabled and all executors will fetch their own copies of files. This optimization may be
+ disabled in order to use Spark local directories that reside on NFS filesystems (see
+ <a href="https://issues.apache.org/jira/browse/SPARK-6313">SPARK-6313</a> for more details).
+ </td>
+</tr>
+<tr>
+ <td><code>spark.files.overwrite</code></td>
+ <td>false</td>
+ <td>
+ Whether to overwrite files added through SparkContext.addFile() when the target file exists and
+ its contents do not match those of the source.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.hadoop.cloneConf</code></td>
+ <td>false</td>
+ <td>If set to true, clones a new Hadoop <code>Configuration</code> object for each task. This
+ option should be enabled to work around <code>Configuration</code> thread-safety issues (see
+ <a href="https://issues.apache.org/jira/browse/SPARK-2546">SPARK-2546</a> for more details).
+ This is disabled by default in order to avoid unexpected performance regressions for jobs that
+ are not affected by these issues.</td>
+</tr>
+<tr>
+ <td><code>spark.hadoop.validateOutputSpecs</code></td>
+ <td>true</td>
+ <td>If set to true, validates the output specification (e.g. checking if the output directory already exists)
+ used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing
+ output directories. We recommend that users do not disable this except if trying to achieve compatibility with
+ previous versions of Spark. Simply use Hadoop's FileSystem API to delete output directories by hand.
+ This setting is ignored for jobs generated through Spark Streaming's StreamingContext, since
+ data may need to be rewritten to pre-existing output directories during checkpoint recovery.</td>
+</tr>
+<tr>
+ <td><code>spark.storage.memoryFraction</code></td>
+ <td>0.6</td>
+ <td>
+ Fraction of Java heap to use for Spark's memory cache. This should not be larger than the "old"
+ generation of objects in the JVM, which by default is given 0.6 of the heap, but you can
+ increase it if you configure your own old generation size.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.storage.memoryMapThreshold</code></td>
+ <td>2097152</td>
+ <td>
+ Size of a block, in bytes, above which Spark memory maps when reading a block from disk.
+ This prevents Spark from memory mapping very small blocks. In general, memory
+ mapping has high overhead for blocks close to or below the page size of the operating system.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.storage.unrollFraction</code></td>
+ <td>0.2</td>
+ <td>
+ Fraction of <code>spark.storage.memoryFraction</code> to use for unrolling blocks in memory.
+ This is dynamically allocated by dropping existing blocks when there is not enough free
+ storage space to unroll the new block in its entirety.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.tachyonStore.baseDir</code></td>
+ <td>System.getProperty("java.io.tmpdir")</td>
+ <td>
+ Directories of the Tachyon File System that store RDDs. The Tachyon file system's URL is set by
+ <code>spark.tachyonStore.url</code>. It can also be a comma-separated list of multiple
+ directories on Tachyon file system.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.tachyonStore.url</code></td>
+ <td>tachyon://localhost:19998</td>
+ <td>
+ The URL of the underlying Tachyon file system in the TachyonStore.
+ </td>
+</tr>
+</table>
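+
+<p>For illustration, a sketch of how a couple of these interact (the numbers are placeholders): when
+<code>spark.default.parallelism</code> is set, shuffle operations that do not specify a partition count pick it up.</p>
+
+<pre><code>import org.apache.spark.{SparkConf, SparkContext}
+
+val conf = new SparkConf()
+  .setMaster("local[4]")
+  .setAppName("ParallelismSketch")
+  .set("spark.default.parallelism", "16")      // used when no partition count is given
+  .set("spark.storage.memoryFraction", "0.5")  // shrink the cache to leave room elsewhere
+val sc = new SparkContext(conf)
+
+val counts = sc.parallelize(1 to 1000).map(i => (i % 10, 1)).reduceByKey(_ + _)
+println(counts.partitions.length)  // expected: 16
+</code></pre>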
+
+<h4 id="networking">Networking</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.akka.failure-detector.threshold</code></td>
+ <td>300.0</td>
+ <td>
+    This is set to a larger value to disable the failure detector that comes built in to Akka. It can be
+    enabled again, if you plan to use this feature (not recommended). This maps to Akka's
+    `akka.remote.transport-failure-detector.threshold`. Tune this in combination with
+    `spark.akka.heartbeat.pauses` and `spark.akka.heartbeat.interval` if you need to.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.akka.frameSize</code></td>
+ <td>10</td>
+ <td>
+ Maximum message size to allow in "control plane" communication (for serialized tasks and task
+ results), in MB. Increase this if your tasks need to send back large results to the driver
+ (e.g. using <code>collect()</code> on a large dataset).
+ </td>
+</tr>
+<tr>
+ <td><code>spark.akka.heartbeat.interval</code></td>
+ <td>1000</td>
+ <td>
+    This is set to a larger value to disable the transport failure detector that comes built in to
+    Akka. It can be enabled again, if you plan to use this feature (not recommended). A larger
+    interval value in seconds reduces network overhead, while a smaller value (~ 1 s) might be more
+    informative for Akka's failure detector. Tune this in combination with `spark.akka.heartbeat.pauses`
+    if you need to. A likely positive use case for the failure detector would be: a sensitive
+    failure detector can help evict rogue executors quickly. However, this is usually not the case,
+    as GC pauses and network lags are expected in a real Spark cluster. Apart from that, enabling
+    this leads to a lot of heartbeat exchanges between nodes, flooding the network
+    with them.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.akka.heartbeat.pauses</code></td>
+ <td>6000</td>
+ <td>
+    This is set to a larger value to disable the transport failure detector that comes built in to Akka.
+    It can be enabled again, if you plan to use this feature (not recommended). This is the acceptable
+    heartbeat pause in seconds for Akka, and can be used to control sensitivity to GC pauses. Tune
+    this along with `spark.akka.heartbeat.interval` if you need to.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.akka.threads</code></td>
+ <td>4</td>
+ <td>
+ Number of actor threads to use for communication. Can be useful to increase on large clusters
+ when the driver has a lot of CPU cores.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.akka.timeout</code></td>
+ <td>100</td>
+ <td>
+ Communication timeout between Spark nodes, in seconds.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.blockManager.port</code></td>
+ <td>(random)</td>
+ <td>
+ Port for all block managers to listen on. These exist on both the driver and the executors.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.broadcast.port</code></td>
+ <td>(random)</td>
+ <td>
+ Port for the driver's HTTP broadcast server to listen on.
+ This is not relevant for torrent broadcast.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.driver.host</code></td>
+ <td>(local hostname)</td>
+ <td>
+ Hostname or IP address for the driver to listen on.
+ This is used for communicating with the executors and the standalone Master.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.driver.port</code></td>
+ <td>(random)</td>
+ <td>
+ Port for the driver to listen on.
+ This is used for communicating with the executors and the standalone Master.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.executor.port</code></td>
+ <td>(random)</td>
+ <td>
+ Port for the executor to listen on. This is used for communicating with the driver.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.fileserver.port</code></td>
+ <td>(random)</td>
+ <td>
+ Port for the driver's HTTP file server to listen on.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.network.timeout</code></td>
+ <td>120</td>
+ <td>
+ Default timeout for all network interactions, in seconds. This config will be used in
+ place of <code>spark.core.connection.ack.wait.timeout</code>, <code>spark.akka.timeout</code>,
+ <code>spark.storage.blockManagerSlaveTimeoutMs</code> or
+ <code>spark.shuffle.io.connectionTimeout</code>, if they are not configured.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.port.maxRetries</code></td>
+ <td>16</td>
+ <td>
+ Default maximum number of retries when binding to a port before giving up.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.replClassServer.port</code></td>
+ <td>(random)</td>
+ <td>
+ Port for the driver's HTTP class server to listen on.
+ This is only relevant for the Spark shell.
+ </td>
+</tr>
+</table>
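+
+<p>For illustration, a sketch that pins the normally random ports to fixed values, as is sometimes done behind
+restrictive firewalls (the port numbers are placeholders):</p>
+
+<pre><code>import org.apache.spark.SparkConf
+
+val conf = new SparkConf()
+  .set("spark.driver.port", "7001")          // placeholder ports
+  .set("spark.fileserver.port", "7002")
+  .set("spark.broadcast.port", "7003")
+  .set("spark.replClassServer.port", "7004")
+  .set("spark.blockManager.port", "7005")
+  .set("spark.executor.port", "7006")
+</code></pre>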
+
+<h4 id="scheduling">Scheduling</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.cores.max</code></td>
+ <td>(not set)</td>
+ <td>
+ When running on a <a href="spark-standalone.html">standalone deploy cluster</a> or a
+ <a href="running-on-mesos.html#mesos-run-modes">Mesos cluster in "coarse-grained"
+ sharing mode</a>, the maximum amount of CPU cores to request for the application from
+ across the cluster (not from each machine). If not set, the default will be
+ <code>spark.deploy.defaultCores</code> on Spark's standalone cluster manager, or
+ infinite (all available cores) on Mesos.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.localExecution.enabled</code></td>
+ <td>false</td>
+ <td>
+ Enables Spark to run certain jobs, such as first() or take() on the driver, without sending
+ tasks to the cluster. This can make certain jobs execute very quickly, but may require
+ shipping a whole partition of data to the driver.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.locality.wait</code></td>
+ <td>3000</td>
+ <td>
+ Number of milliseconds to wait to launch a data-local task before giving up and launching it
+ on a less-local node. The same wait will be used to step through multiple locality levels
+ (process-local, node-local, rack-local and then any). It is also possible to customize the
+ waiting time for each level by setting <code>spark.locality.wait.node</code>, etc.
+    You should increase this setting if your tasks are long and you see poor locality, but the
+ default usually works well.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.locality.wait.node</code></td>
+ <td>spark.locality.wait</td>
+ <td>
+ Customize the locality wait for node locality. For example, you can set this to 0 to skip
+ node locality and search immediately for rack locality (if your cluster has rack information).
+ </td>
+</tr>
+<tr>
+ <td><code>spark.locality.wait.process</code></td>
+ <td>spark.locality.wait</td>
+ <td>
+ Customize the locality wait for process locality. This affects tasks that attempt to access
+ cached data in a particular executor process.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.locality.wait.rack</code></td>
+ <td>spark.locality.wait</td>
+ <td>
+ Customize the locality wait for rack locality.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.scheduler.maxRegisteredResourcesWaitingTime</code></td>
+ <td>30000</td>
+ <td>
+ Maximum amount of time to wait for resources to register before scheduling begins
+ (in milliseconds).
+ </td>
+</tr>
+<tr>
+ <td><code>spark.scheduler.minRegisteredResourcesRatio</code></td>
+ <td>0.8 for YARN mode; 0.0 otherwise</td>
+ <td>
+ The minimum ratio of registered resources (registered resources / total expected resources)
+ (resources are executors in yarn mode, CPU cores in standalone mode)
+ to wait for before scheduling begins. Specified as a double between 0.0 and 1.0.
+ Regardless of whether the minimum ratio of resources has been reached,
+ the maximum amount of time it will wait before scheduling begins is controlled by config
+ <code>spark.scheduler.maxRegisteredResourcesWaitingTime</code>.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.scheduler.mode</code></td>
+ <td>FIFO</td>
+ <td>
+ The <a href="job-scheduling.html#scheduling-within-an-application">scheduling mode</a> between
+ jobs submitted to the same SparkContext. Can be set to <code>FAIR</code>
+ to use fair sharing instead of queueing jobs one after another. Useful for
+ multi-user services.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.scheduler.revive.interval</code></td>
+ <td>1000</td>
+ <td>
+ The interval length for the scheduler to revive the worker resource offers to run tasks
+ (in milliseconds).
+ </td>
+</tr>
+<tr>
+ <td><code>spark.speculation</code></td>
+ <td>false</td>
+ <td>
+ If set to "true", performs speculative execution of tasks. This means if one or more tasks are
+ running slowly in a stage, they will be re-launched.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.speculation.interval</code></td>
+ <td>100</td>
+ <td>
+ How often Spark will check for tasks to speculate, in milliseconds.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.speculation.multiplier</code></td>
+ <td>1.5</td>
+ <td>
+ How many times slower a task is than the median to be considered for speculation.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.speculation.quantile</code></td>
+ <td>0.75</td>
+ <td>
+ Percentage of tasks which must be complete before speculation is enabled for a particular stage.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.task.cpus</code></td>
+ <td>1</td>
+ <td>
+ Number of cores to allocate for each task.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.task.maxFailures</code></td>
+ <td>4</td>
+ <td>
+ Number of individual task failures before giving up on the job.
+ Should be greater than or equal to 1. Number of allowed retries = this value - 1.
+ </td>
+</tr>
+</table>
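+
+<p>For illustration, a sketch of a few scheduling knobs (the values are placeholders, not recommendations):</p>
+
+<pre><code>import org.apache.spark.SparkConf
+
+val conf = new SparkConf()
+  .set("spark.scheduler.mode", "FAIR")  // fair sharing between jobs in one SparkContext
+  .set("spark.speculation", "true")     // re-launch tasks that run much slower than the median
+  .set("spark.locality.wait", "6000")   // wait up to 6 seconds for a data-local slot
+  .set("spark.task.maxFailures", "8")   // tolerate more task failures before failing the job
+</code></pre>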
+
+<h4 id="dynamic-allocation">Dynamic Allocation</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.dynamicAllocation.enabled</code></td>
+ <td>false</td>
+ <td>
+ Whether to use dynamic resource allocation, which scales the number of executors registered
+ with this application up and down based on the workload. Note that this is currently only
+ available on YARN mode. For more detail, see the description
+ <a href="job-scheduling.html#dynamic-resource-allocation">here</a>.
+ <br /><br />
+ This requires <code>spark.shuffle.service.enabled</code> to be set.
+ The following configurations are also relevant:
+ <code>spark.dynamicAllocation.minExecutors</code>,
+ <code>spark.dynamicAllocation.maxExecutors</code>, and
+ <code>spark.dynamicAllocation.initialExecutors</code>
+ </td>
+</tr>
+<tr>
+ <td><code>spark.dynamicAllocation.executorIdleTimeout</code></td>
+ <td>600</td>
+ <td>
+ If dynamic allocation is enabled and an executor has been idle for more than this duration
+ (in seconds), the executor will be removed. For more detail, see this
+ <a href="job-scheduling.html#resource-allocation-policy">description</a>.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.dynamicAllocation.initialExecutors</code></td>
+ <td><code>spark.dynamicAllocation.minExecutors</code></td>
+ <td>
+ Initial number of executors to run if dynamic allocation is enabled.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.dynamicAllocation.maxExecutors</code></td>
+ <td>Integer.MAX_VALUE</td>
+ <td>
+ Upper bound for the number of executors if dynamic allocation is enabled.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.dynamicAllocation.minExecutors</code></td>
+ <td>0</td>
+ <td>
+ Lower bound for the number of executors if dynamic allocation is enabled.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.dynamicAllocation.schedulerBacklogTimeout</code></td>
+ <td>5</td>
+ <td>
+ If dynamic allocation is enabled and there have been pending tasks backlogged for more than
+ this duration (in seconds), new executors will be requested. For more detail, see this
+ <a href="job-scheduling.html#resource-allocation-policy">description</a>.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.dynamicAllocation.sustainedSchedulerBacklogTimeout</code></td>
+ <td><code>schedulerBacklogTimeout</code></td>
+ <td>
+ Same as <code>spark.dynamicAllocation.schedulerBacklogTimeout</code>, but used only for
+ subsequent executor requests. For more detail, see this
+ <a href="job-scheduling.html#resource-allocation-policy">description</a>.
+ </td>
+</tr>
+</table>
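+
+<p>For illustration, a sketch that enables dynamic allocation on YARN together with the external shuffle service it
+requires (the executor counts are placeholders):</p>
+
+<pre><code>import org.apache.spark.SparkConf
+
+val conf = new SparkConf()
+  .set("spark.dynamicAllocation.enabled", "true")
+  .set("spark.shuffle.service.enabled", "true")  // required by dynamic allocation
+  .set("spark.dynamicAllocation.minExecutors", "2")
+  .set("spark.dynamicAllocation.maxExecutors", "20")
+  .set("spark.dynamicAllocation.initialExecutors", "5")
+</code></pre>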
+
+<h4 id="security">Security</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.acls.enable</code></td>
+ <td>false</td>
+ <td>
+    Whether Spark ACLs are enabled. If enabled, this checks to see if the user has
+ access permissions to view or modify the job. Note this requires the user to be known,
+ so if the user comes across as null no checks are done. Filters can be used with the UI
+ to authenticate and set the user.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.admin.acls</code></td>
+ <td>Empty</td>
+ <td>
+ Comma separated list of users/administrators that have view and modify access to all Spark jobs.
+    This can be used if you run on a shared cluster and have a set of administrators or developers who
+    help debug when things do not work.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.authenticate</code></td>
+ <td>false</td>
+ <td>
+ Whether Spark authenticates its internal connections. See
+ <code>spark.authenticate.secret</code> if not running on YARN.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.authenticate.secret</code></td>
+ <td>None</td>
+ <td>
+ Set the secret key used for Spark to authenticate between components. This needs to be set if
+ not running on YARN and authentication is enabled.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.core.connection.ack.wait.timeout</code></td>
+ <td>60</td>
+ <td>
+ Number of seconds for the connection to wait for ack to occur before timing
+    out and giving up. To avoid unwanted timeouts caused by long pauses such as GC,
+    you can set a larger value.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.core.connection.auth.wait.timeout</code></td>
+ <td>30</td>
+ <td>
+ Number of seconds for the connection to wait for authentication to occur before timing
+ out and giving up.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.modify.acls</code></td>
+ <td>Empty</td>
+ <td>
+ Comma separated list of users that have modify access to the Spark job. By default only the
+ user that started the Spark job has access to modify it (kill it for example).
+ </td>
+</tr>
+<tr>
+ <td><code>spark.ui.filters</code></td>
+ <td>None</td>
+ <td>
+ Comma separated list of filter class names to apply to the Spark web UI. The filter should be a
+ standard <a href="http://docs.oracle.com/javaee/6/api/javax/servlet/Filter.html">
+ javax servlet Filter</a>. Parameters to each filter can also be specified by setting a
+ java system property of: <br />
+ <code>spark.&lt;class name of filter&gt;.params='param1=value1,param2=value2'</code><br />
+ For example: <br />
+ <code>-Dspark.ui.filters=com.test.filter1</code> <br />
+ <code>-Dspark.com.test.filter1.params='param1=foo,param2=testing'</code>
+ </td>
+</tr>
+<tr>
+ <td><code>spark.ui.view.acls</code></td>
+ <td>Empty</td>
+ <td>
+ Comma separated list of users that have view access to the Spark web ui. By default only the
+ user that started the Spark job has view access.
+ </td>
+</tr>
+</table>
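+
+<p>For illustration, a sketch that turns on authentication and ACLs outside of YARN (the secret and user names are
+placeholders; do not commit a real secret to source control):</p>
+
+<pre><code>import org.apache.spark.SparkConf
+
+val conf = new SparkConf()
+  .set("spark.authenticate", "true")
+  .set("spark.authenticate.secret", "change-me")  // placeholder; not needed when running on YARN
+  .set("spark.acls.enable", "true")
+  .set("spark.ui.view.acls", "alice,bob")         // placeholder user names
+  .set("spark.modify.acls", "alice")
+  .set("spark.admin.acls", "ops-admin")
+</code></pre>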
+
+<h4 id="encryption">Encryption</h4>
+
+<table class="table">
+ <tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+ <tr>
+ <td><code>spark.ssl.enabled</code></td>
+ <td>false</td>
+ <td>
+ <p>Whether to enable SSL connections on all supported protocols.</p>
+
+      <p>All the SSL settings like <code>spark.ssl.xxx</code>, where <code>xxx</code> is a
+      particular configuration property, denote the global configuration for all the supported
+      protocols. To override the global configuration for a particular protocol,
+      the properties must be overwritten in the protocol-specific namespace.</p>
+
+      <p>Use <code>spark.ssl.YYY.XXX</code> settings to overwrite the global configuration for
+      a particular protocol denoted by <code>YYY</code>. Currently <code>YYY</code> can be
+      either <code>akka</code> for Akka-based connections or <code>fs</code> for the broadcast and
+      file server.</p>
+ </td>
+ </tr>
+ <tr>
+ <td><code>spark.ssl.enabledAlgorithms</code></td>
+ <td>Empty</td>
+ <td>
+      A comma separated list of ciphers. The specified ciphers must be supported by the JVM.
+      A reference list of supported cipher suites can be found on
+      <a href="https://blogs.oracle.com/java-platform-group/entry/diagnosing_tls_ssl_and_https">this</a>
+      page.
+ </td>
+ </tr>
+ <tr>
+ <td><code>spark.ssl.keyPassword</code></td>
+ <td>None</td>
+ <td>
+      The password to the private key in the key-store.
+ </td>
+ </tr>
+ <tr>
+ <td><code>spark.ssl.keyStore</code></td>
+ <td>None</td>
+ <td>
+      A path to a key-store file. The path can be absolute or relative to the directory
+      in which the component is started.
+ </td>
+ </tr>
+ <tr>
+ <td><code>spark.ssl.keyStorePassword</code></td>
+ <td>None</td>
+ <td>
+      The password to the key-store.
+ </td>
+ </tr>
+ <tr>
+ <td><code>spark.ssl.protocol</code></td>
+ <td>None</td>
+ <td>
+      A protocol name. The protocol must be supported by the JVM. A reference list of protocols
+      can be found on <a href="https://blogs.oracle.com/java-platform-group/entry/diagnosing_tls_ssl_and_https">this</a>
+      page.
+ </td>
+ </tr>
+ <tr>
+ <td><code>spark.ssl.trustStore</code></td>
+ <td>None</td>
+ <td>
+      A path to a trust-store file. The path can be absolute or relative to the directory
+      in which the component is started.
+ </td>
+ </tr>
+ <tr>
+ <td><code>spark.ssl.trustStorePassword</code></td>
+ <td>None</td>
+ <td>
+      The password to the trust-store.
+ </td>
+ </tr>
+</table>
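+
+<p>As a sketch of how the global and protocol-specific namespaces interact, the following
+configuration enables SSL everywhere and then narrows the allowed cipher suites for the file
+server (<code>fs</code>) only. The paths, passwords, and cipher suite shown are illustrative
+placeholders.</p>
+
+<pre><code>import org.apache.spark.SparkConf
+
+// Global SSL settings apply to all supported protocols (akka and fs).
+val conf = new SparkConf()
+  .set("spark.ssl.enabled", "true")
+  .set("spark.ssl.protocol", "TLSv1.2")
+  .set("spark.ssl.keyStore", "/path/to/keystore.jks")
+  .set("spark.ssl.keyStorePassword", "store-password")
+  .set("spark.ssl.keyPassword", "key-password")
+  .set("spark.ssl.trustStore", "/path/to/truststore.jks")
+  .set("spark.ssl.trustStorePassword", "trust-password")
+  // Override the global setting for the file server / broadcast protocol only:
+  .set("spark.ssl.fs.enabledAlgorithms", "TLS_RSA_WITH_AES_128_CBC_SHA")
+</code></pre>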
+
+<h4 id="spark-streaming">Spark Streaming</h4>
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+ <td><code>spark.streaming.blockInterval</code></td>
+ <td>200</td>
+ <td>
+ Interval (milliseconds) at which data received by Spark Streaming receivers is chunked
+    into blocks of data before storing them in Spark. The minimum recommended value is 50 ms. See the
+    <a href="streaming-programming-guide.html#level-of-parallelism-in-data-receiving">performance
+    tuning</a> section in the Spark Streaming programming guide for more details.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.streaming.receiver.maxRate</code></td>
+ <td>not set</td>
+ <td>
+ Maximum rate (number of records per second) at which each receiver will receive data.
+ Effectively, each stream will consume at most this number of records per second.
+ Setting this configuration to 0 or a negative number will put no limit on the rate.
+ See the <a href="streaming-programming-guide.html#deploying-applications">deployment guide</a>
+    in the Spark Streaming programming guide for more details.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.streaming.receiver.writeAheadLog.enable</code></td>
+ <td>false</td>
+ <td>
+ Enable write ahead logs for receivers. All the input data received through receivers
+ will be saved to write ahead logs that will allow it to be recovered after driver failures.
+ See the <a href="streaming-programming-guide.html#deploying-applications">deployment guide</a>
+    in the Spark Streaming programming guide for more details.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.streaming.unpersist</code></td>
+ <td>true</td>
+ <td>
+ Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from
+ Spark's memory. The raw input data received by Spark Streaming is also automatically cleared.
+    Setting this to false will allow the raw data and persisted RDDs to be accessible outside the
+    streaming application, as they will not be cleared automatically, but this comes at the cost of
+    higher memory usage in Spark.
+ </td>
+</tr>
+<tr>
+ <td><code>spark.streaming.kafka.maxRatePerPartition</code></td>
+ <td>not set</td>
+ <td>
+ Maximum rate (number of records per second) at which data will be read from each Kafka
+ partition when using the new Kafka direct stream API. See the
+ <a href="streaming-kafka-integration.html">Kafka Integration guide</a>
+ for more details.
+ </td>
+</tr>
+</table>
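+
+<p>A minimal Scala sketch combining several of the streaming settings above; the specific
+values (block interval, receiver rate) are illustrative, not recommendations.</p>
+
+<pre><code>import org.apache.spark.SparkConf
+import org.apache.spark.streaming.{Seconds, StreamingContext}
+
+val conf = new SparkConf()
+  .setAppName("StreamingApp")
+  .set("spark.streaming.blockInterval", "100")                    // chunk received data every 100 ms
+  .set("spark.streaming.receiver.maxRate", "10000")               // at most 10000 records/sec per receiver
+  .set("spark.streaming.receiver.writeAheadLog.enable", "true")   // recover received data after driver failure
+
+// 1-second batches; the batch interval is independent of the block interval.
+val ssc = new StreamingContext(conf, Seconds(1))
+</code></pre>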
+
+<h4 id="cluster-managers">Cluster Managers</h4>
+<p>Each cluster manager in Spark has additional configuration options. Configurations
+can be found on the pages for each mode:</p>
+
+<ul>
+ <li><a href="running-on-yarn.html#configuration">YARN</a></li>
+ <li><a href="running-on-mesos.html">Mesos</a></li>
+ <li><a href="spark-standalone.html#cluster-launch-scripts">Standalone Mode</a></li>
+</ul>
+
+<h1 id="environment-variables">Environment Variables</h1>
+
+<p>Certain Spark settings can be configured through environment variables, which are read from the
+<code>conf/spark-env.sh</code> script in the directory where Spark is installed (or <code>conf/spark-env.cmd</code> on
+Windows). In Standalone and Mesos modes, this file can give machine-specific information such as
+hostnames. It is also sourced when running local Spark applications or submission scripts.</p>
+
+<p>Note that <code>conf/spark-env.sh</code> does not exist by default when Spark is installed. However, you can
+copy <code>conf/spark-env.sh.template</code> to create it. Make sure you make the copy executable.</p>
+
+<p>The following variables can be set in <code>spark-env.sh</code>:</p>
+
+<table class="table">
+ <tr><th style="width:21%">Environment Variable</th><th>Meaning</th></tr>
+ <tr>
+ <td><code>JAVA_HOME</code></td>
+    <td>Location where Java is installed (if it's not on your default <code>PATH</code>).</td>
+ </tr>
+ <tr>
+ <td><code>PYSPARK_PYTHON</code></td>
+ <td>Python binary executable to use for PySpark.</td>
+ </tr>
+ <tr>
+ <td><code>SPARK_LOCAL_IP</code></td>
+ <td>IP address of the machine to bind to.</td>
+ </tr>
+ <tr>
+ <td><code>SPARK_PUBLIC_DNS</code></td>
+ <td>Hostname your Spark program will advertise to other machines.</td>
+ </tr>
+</table>
+
+<p>In addition to the above, there are also options for setting up the Spark
+<a href="spark-standalone.html#cluster-launch-scripts">standalone cluster scripts</a>, such as number of cores
+to use on each machine and maximum memory.</p>
+
+<p>Since <code>spark-env.sh</code> is a shell script, some of these can be set programmatically &#8211; for example, you might
+compute <code>SPARK_LOCAL_IP</code> by looking up the IP of a specific network interface.</p>
+
+<h1 id="configuring-logging">Configuring Logging</h1>
+
+<p>Spark uses <a href="http://logging.apache.org/log4j/">log4j</a> for logging. You can configure it by adding a
+<code>log4j.properties</code> file in the <code>conf</code> directory. One way to start is to copy the existing
+<code>log4j.properties.template</code> located there.</p>
+
+<h1 id="overriding-configuration-directory">Overriding configuration directory</h1>
+
+<p>To specify a configuration directory other than the default &#8220;SPARK_HOME/conf&#8221;,
+you can set <code>SPARK_CONF_DIR</code>. Spark will use the configuration files (<code>spark-defaults.conf</code>, <code>spark-env.sh</code>, <code>log4j.properties</code>, etc.)
+from this directory.</p>
+
+
+
+ </div> <!-- /container -->
+
+ <script src="js/vendor/jquery-1.8.0.min.js"></script>
+ <script src="js/vendor/bootstrap.min.js"></script>
+ <script src="js/main.js"></script>
+
+ <!-- MathJax Section -->
+ <script type="text/x-mathjax-config">
+ MathJax.Hub.Config({
+ TeX: { equationNumbers: { autoNumber: "AMS" } }
+ });
+ </script>
+ <script>
+ // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
+ // We could use "//cdn.mathjax...", but that won't support "file://".
+ (function(d, script) {
+ script = d.createElement('script');
+ script.type = 'text/javascript';
+ script.async = true;
+ script.onload = function(){
+ MathJax.Hub.Config({
+ tex2jax: {
+ inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
+ displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
+ processEscapes: true,
+ skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+ }
+ });
+ };
+ script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
+ 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+ d.getElementsByTagName('head')[0].appendChild(script);
+ }(document));
+ </script>
+ </body>
+</html>