summaryrefslogblamecommitdiff
path: root/site/sql/index.html
blob: 46a42026952d5966077b4c1333b3ab8351112e12 (plain) (tree)

















































                                                                                                                     




                                                                                     
































                                                                                      
                                         
















                                                                   
                                                                           

             
                                                    
                           
                                                                                 






                                                                                                              

                                                                                                                














                                                   


                                                                                                                           


                                                                                                                                


                                                                                                                                   


                                                                                







                                                                                                      
                           

                              
























































































































































































                                                                                                                                                         

       
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>
     Spark SQL | Apache Spark
    
  </title>

  

  <!-- Bootstrap core CSS -->
  <link href="/css/cerulean.min.css" rel="stylesheet">
  <link href="/css/custom.css" rel="stylesheet">

  <script type="text/javascript">
  <!-- Google Analytics initialization -->
  var _gaq = _gaq || [];
  _gaq.push(['_setAccount', 'UA-32518208-2']);
  _gaq.push(['_trackPageview']);
  (function() {
    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
  })();

  <!-- Adds slight delay to links to allow async reporting -->
  // Queues a Google Analytics '_trackEvent' for the given category/action,
  // then navigates the page to link.href after a 100 ms timer.
  function trackOutboundLink(link, category, action) {
    try {
      var payload = ['_trackEvent', category , action];
      _gaq.push(payload);
    } catch (ignored) {
      // Best-effort: a failure while queueing the event must not stop navigation.
    }

    var followLink = function() {
      document.location.href = link.href;
    };
    setTimeout(followLink, 100);
  }
  </script>

  <!-- HTML5 shim and Respond.js IE8 support of HTML5 elements and media queries -->
  <!--[if lt IE 9]>
  <script src="https://oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
  <script src="https://oss.maxcdn.com/libs/respond.js/1.3.0/respond.min.js"></script>
  <![endif]-->
</head>

<body>

<script src="https://code.jquery.com/jquery.js"></script>
<script src="//netdna.bootstrapcdn.com/bootstrap/3.0.3/js/bootstrap.min.js"></script>
<script src="/js/lang-tabs.js"></script>
<script src="/js/downloads.js"></script>

<div class="container" style="max-width: 1200px;">

<div class="masthead">
  
    <p class="lead">
      <a href="/">
      <img src="/images/spark-logo.png"
      style="height:100px; width:auto; vertical-align: bottom; margin-top: 20px;"></a>
      <a href="#"><span class="subproject">
        SQL
      </span></a>
    </p>
  
</div>

<nav class="navbar navbar-default" role="navigation">
  <!-- Brand and toggle get grouped for better mobile display -->
  <div class="navbar-header">
    <button type="button" class="navbar-toggle" data-toggle="collapse"
            data-target="#navbar-collapse-1">
      <span class="sr-only">Toggle navigation</span>
      <span class="icon-bar"></span>
      <span class="icon-bar"></span>
      <span class="icon-bar"></span>
    </button>
  </div>

  <!-- Collect the nav links, forms, and other content for toggling -->
  <div class="collapse navbar-collapse" id="navbar-collapse-1">
    <ul class="nav navbar-nav">
      <li><a href="/downloads.html">Download</a></li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Libraries <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          
          <li><a href="/">Apache Spark</a></li>
          
          <li><a href="/sql/">Spark SQL</a></li>
          <li><a href="/streaming/">Spark Streaming</a></li>
          <li><a href="/mllib/">MLlib (machine learning)</a></li>
          <li><a href="/graphx/">GraphX (graph)</a></li>
        </ul>
      </li>
      <li class="dropdown">
        <a href="#" class="dropdown-toggle" data-toggle="dropdown">
          Documentation <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/documentation.html">Overview</a></li>
          <li><a href="/docs/latest/">Latest Release (Spark 1.1.0)</a></li>
        </ul>
      </li>
      <li><a href="/examples.html">Examples</a></li>
      <li class="dropdown">
        <a href="/community.html" class="dropdown-toggle" data-toggle="dropdown">
          Community <b class="caret"></b>
        </a>
        <ul class="dropdown-menu">
          <li><a href="/community.html">Mailing Lists</a></li>
          <li><a href="/community.html#events">Events and Meetups</a></li>
          <li><a href="/community.html#history">Project History</a></li>
          <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Powered+By+Spark">Powered By</a></li>
          <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Committers">Project Committers</a></li>
          <li><a href="https://issues.apache.org/jira/browse/SPARK">Issue Tracker</a></li>
        </ul>
      </li>
      <li><a href="/faq.html">FAQ</a></li>
    </ul>
  </div>
  <!-- /.navbar-collapse -->
</nav>


<div class="row">
  <div class="col-md-3 col-md-push-9">
    <div class="news" style="margin-bottom: 20px;">
      <h5>Latest News</h5>
      <ul class="list-unstyled">
        
          <li><a href="/news/registration-open-for-spark-summit-east.html">Registration open for Spark Summit East 2015</a>
          <span class="small">(Nov 26, 2014)</span></li>
        
          <li><a href="/news/spark-wins-daytona-gray-sort-100tb-benchmark.html">Spark wins Daytona Gray Sort 100TB Benchmark</a>
          <span class="small">(Nov 05, 2014)</span></li>
        
          <li><a href="/news/proposals-open-for-spark-summit-east.html">Submissions open for Spark Summit East 2015 in New York</a>
          <span class="small">(Oct 18, 2014)</span></li>
        
          <li><a href="/news/spark-1-1-0-released.html">Spark 1.1.0 released</a>
          <span class="small">(Sep 11, 2014)</span></li>
        
      </ul>
      <p class="small" style="text-align: right;"><a href="/news/index.html">Archive</a></p>
    </div>
    <div class="hidden-xs hidden-sm">
      <a href="/downloads.html" class="btn btn-success btn-lg btn-block" style="margin-bottom: 30px;">
        Download Spark
      </a>
      <p style="font-size: 16px; font-weight: 500; color: #555;">
        Built-in Libraries:
      </p>
      <ul class="list-narrow">
        <li><a href="/sql/">Spark SQL</a></li>
        <li><a href="/streaming/">Spark Streaming</a></li>
        <li><a href="/mllib/">MLlib (machine learning)</a></li>
        <li><a href="/graphx/">GraphX (graph)</a></li>
      </ul>
    </div>
  </div>

  <div class="col-md-9 col-md-pull-3">
    <div class="jumbotron">
  <b>Spark SQL</b> unifies access to structured data.
</div>

<div class="row row-padded">
  <div class="col-md-7 col-sm-7">
    <h2>Integrated</h2>
    <p class="lead">
	  Seamlessly mix SQL queries with Spark programs.
    </p>
    <p>
	  Spark SQL lets you query structured data as a distributed dataset (RDD) in Spark, with integrated APIs in Python, Scala and Java. 
	  This tight integration makes it easy to run SQL queries alongside complex analytic algorithms.
    </p>
  </div>
  <div class="col-md-5 col-sm-5 col-padded-top col-center">

    <div style="margin-top: 15px; text-align: left; display: inline-block;">
      <div class="code">
	    sqlCtx = new <span class="sparkop">HiveContext</span>(sc)<br />
		results = sqlCtx.<span class="sparkop">sql</span>(<br />&nbsp;&nbsp;<span class="closure">"SELECT * FROM people"</span>)<br />
		names = results.<span class="sparkop">map</span>(<span class="closure">lambda p: p.name</span>)
	  </div>
      <div class="caption">Apply functions to results of SQL queries.</div>
    </div>
  </div>
</div>

<div class="row row-padded">
  <div class="col-md-7 col-sm-7">
    <h2>Unified Data Access</h2>
    <p class="lead">
      Load and query data from a variety of sources.
    </p>
    <p>
      SchemaRDDs provide a single interface for efficiently working with structured data, including Apache Hive tables, parquet files and JSON files.
    </p>
  </div>
  <div class="col-md-5 col-sm-5 col-padded-top col-center">
    <div style="margin-top: 15px; text-align: left; display: inline-block;">
      <div class="code">
		sqlCtx.<span class="sparkop">jsonFile</span>(<span class="closure">"s3n://..."</span>)<br />&nbsp;&nbsp;.registerAsTable("json")<br />
		schema_rdd = sqlCtx.<span class="sparkop">sql</span>(<span class="closure">"""<br />
			&nbsp;&nbsp;SELECT * <br />
			&nbsp;&nbsp;FROM hiveTable<br />
			&nbsp;&nbsp;JOIN json ..."""</span>)<br />
	  </div>
      <div class="caption">Query and join different data sources.</div>
    </div>
  </div>
</div>

<div class="row row-padded">
  <div class="col-md-7 col-sm-7">
    <h2>Hive Compatibility</h2>
    <p class="lead">
      Run unmodified Hive queries on existing warehouses.
    </p>
    <p>
      Spark SQL reuses the Hive frontend and metastore, giving you full compatibility with
      existing Hive data, queries, and UDFs. Simply install it alongside Hive.
    </p>
  </div>
  <div class="col-md-5 col-sm-5 col-padded-top col-center">
    <div style="width: 100%; max-width: 323px; display: inline-block">
      <img src="/images/sql-hive-arch.png" style="width: 100%; max-width: 323px;" />
      <div class="caption">Spark SQL can use existing Hive metastores, SerDes, and UDFs.</div>
    </div>
  </div>
</div>

<div class="row row-padded">
  <div class="col-md-7 col-sm-7">
    <h2>Standard Connectivity</h2>
    <p class="lead">
      Connect through JDBC or ODBC.
    </p>
    <p>
      Spark SQL includes a server mode with industry standard JDBC and ODBC connectivity.
    </p>
  </div>
  <div class="col-md-5 col-sm-5 col-padded-top col-center">
    <div style="width: 100%; max-width: 323px; display: inline-block">
      <img src="/images/jdbc.png" style="width: 75%; max-width: 323px;" />
      <div class="caption">Use your existing BI tools to query big data.</div>
    </div>
  </div>
</div>

<!--
<div class="row row-padded">
  <div class="col-md-7 col-sm-7">
    <h2>Speed</h2>
    <p class="lead">
      Optimized to execute on Spark.
    </p>
    <p>
      Spark SQL was built using the Catalyst optimizer, which automatically rewrites your queries to execute more efficiently.
  	  By leveraging advanced techniques like runtime code generation, Spark SQL makes it easier to write lightning-fast analytic applications.
    </p>
  </div>
  <div class="col-md-5 col-sm-5 col-padded-top col-center">
    <div style="width: 100%; max-width: 272px; display: inline-block; text-align: center;">
      <img src="/images/sqlperf.png" style="width: 100%; max-width: 250px;">
      <div class="caption" style="min-width: 272px;">Performance comparison between Shark and Spark SQL</div>
    </div>
  </div>
</div>
-->


  </div>
</div>


  
<div class="row">
  <div class="col-md-4 col-padded">
    <h3>Scalability</h3>
    <p>
  	  Use the same engine for both interactive and long queries.		
    </p>
	<p>
      Spark SQL takes advantage of the RDD model to support mid-query fault tolerance, letting it scale to large jobs too.
	  Don't worry about using a different engine for historical data.
    </p>
  </div>

  <div class="col-md-4 col-padded">
    <h3>Community</h3>
    <p>
      Spark SQL is developed as part of Apache Spark. It thus gets
      tested and updated with each Spark release.
    </p>
    <p>
      If you have questions about the system, ask on the
      <a href="/community.html#mailing-lists">Spark mailing lists</a>.
    </p>
    <p>
      The Spark SQL developers welcome contributions. If you'd like to help out,
      read <a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">how to
      contribute to Spark</a>, and send us a patch!
    </p>
  </div>

  <div class="col-md-4 col-padded">
    <h3>Getting Started</h3>
    <p>
      To get started with Spark SQL:
    </p>
    <ul class="list-narrow">
      <li><a href="/downloads.html">Download Spark</a>. It includes Spark SQL as a module.</li>
      <li>Read the <a href="/docs/latest/sql-programming-guide.html">Spark SQL programming guide</a>, which includes examples of common use cases.</li>
    </ul>
  </div>
</div>

<div class="row">
  <div class="col-sm-12 col-center">
    <a href="/downloads.html" class="btn btn-success btn-lg btn-multiline">
      Download Spark<br /><span class="small">Includes Spark SQL</span>
    </a>
  </div>
</div>




<footer class="small">
  <hr>
  Apache Spark, Spark, Apache, and the Spark logo are trademarks of
  <a href="http://www.apache.org">The Apache Software Foundation</a>.
</footer>

</div>

</body>
</html>