Diffstat (limited to 'docs')
-rwxr-xr-x  docs/_layouts/global.html       |   9
-rw-r--r--  docs/_plugins/copy_api_dirs.rb  |  13
-rw-r--r--  docs/index.md                   |   1
-rw-r--r--  docs/sql-programming-guide.md   | 143
4 files changed, 166 insertions, 0 deletions
diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html
index 49fd78ca98..5d4dbb7a9c 100755
--- a/docs/_layouts/global.html
+++ b/docs/_layouts/global.html
@@ -66,6 +66,7 @@
             <li><a href="python-programming-guide.html">Spark in Python</a></li>
             <li class="divider"></li>
             <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
+            <li><a href="sql-programming-guide.html">Spark SQL</a></li>
             <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
             <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
             <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
@@ -79,6 +80,14 @@
             <li><a href="api/pyspark/index.html">Spark Core for Python</a></li>
             <li class="divider"></li>
             <li><a href="api/streaming/index.html#org.apache.spark.streaming.package">Spark Streaming</a></li>
+            <li class="dropdown-submenu">
+              <a tabindex="-1" href="#">Spark SQL</a>
+              <ul class="dropdown-menu">
+                <li><a href="api/sql/core/org/apache/spark/sql/SQLContext.html">Spark SQL Core</a></li>
+                <li><a href="api/sql/hive/org/apache/spark/sql/hive/package.html">Hive Support</a></li>
+                <li><a href="api/sql/catalyst/org/apache/spark/sql/catalyst/package.html">Catalyst (Optimization)</a></li>
+              </ul>
+            </li>
             <li><a href="api/mllib/index.html#org.apache.spark.mllib.package">MLlib (Machine Learning)</a></li>
             <li><a href="api/bagel/index.html#org.apache.spark.bagel.package">Bagel (Pregel on Spark)</a></li>
             <li><a href="api/graphx/index.html#org.apache.spark.graphx.package">GraphX (Graph Processing)</a></li>
diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb
index 44d64057f4..2245bcbc70 100644
--- a/docs/_plugins/copy_api_dirs.rb
+++ b/docs/_plugins/copy_api_dirs.rb
@@ -22,6 +22,7 @@ if not (ENV['SKIP_API'] == '1' or ENV['SKIP_SCALADOC'] == '1')
   # Build Scaladoc for Java/Scala
   core_projects = ["core", "examples", "repl", "bagel", "graphx", "streaming", "mllib"]
   external_projects = ["flume", "kafka", "mqtt", "twitter", "zeromq"]
+  sql_projects = ["catalyst", "core", "hive"]

   projects = core_projects + external_projects.map { |project_name| "external/" + project_name }

@@ -49,6 +50,18 @@ if not (ENV['SKIP_API'] == '1' or ENV['SKIP_SCALADOC'] == '1')
     cp_r(source + "/.", dest)
   end

+  sql_projects.each do |project_name|
+    source = "../sql/" + project_name + "/target/scala-2.10/api/"
+    dest = "api/sql/" + project_name
+
+    puts "echo making directory " + dest
+    mkdir_p dest
+
+    # From the rubydoc: cp_r('src', 'dest') makes src/dest, but this doesn't.
+    puts "cp -r " + source + "/. " + dest
+    cp_r(source + "/.", dest)
+  end
+
   # Build Epydoc for Python
   puts "Moving to python directory and building epydoc."
cd("../python") diff --git a/docs/index.md b/docs/index.md index 23311101e1..7a13fa9a9a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -78,6 +78,7 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui * [Java Programming Guide](java-programming-guide.html): using Spark from Java * [Python Programming Guide](python-programming-guide.html): using Spark from Python * [Spark Streaming](streaming-programming-guide.html): Spark's API for processing data streams +* [Spark SQL](sql-programming-guide.html): Support for running relational queries on Spark * [MLlib (Machine Learning)](mllib-guide.html): Spark's built-in machine learning library * [Bagel (Pregel on Spark)](bagel-programming-guide.html): simple graph processing model * [GraphX (Graphs on Spark)](graphx-programming-guide.html): Spark's new API for graphs diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md new file mode 100644 index 0000000000..b6f21a5dc6 --- /dev/null +++ b/docs/sql-programming-guide.md @@ -0,0 +1,143 @@ +--- +layout: global +title: Spark SQL Programming Guide +--- +**Spark SQL is currently an Alpha component. Therefore, the APIs may be changed in future releases.** + +* This will become a table of contents (this text will be scraped). +{:toc} + +# Overview +Spark SQL allows relational queries expressed in SQL, HiveQL, or Scala to be executed using +Spark. At the core of this component is a new type of RDD, +[SchemaRDD](api/sql/core/index.html#org.apache.spark.sql.SchemaRDD). SchemaRDDs are composed +[Row](api/sql/catalyst/index.html#org.apache.spark.sql.catalyst.expressions.Row) objects along with +a schema that describes the data types of each column in the row. A SchemaRDD is similar to a table +in a traditional relational database. A SchemaRDD can be created from an existing RDD, parquet +file, or by running HiveQL against data stored in [Apache Hive](http://hive.apache.org/). + +**All of the examples on this page use sample data included in the Spark distribution and can be run in the spark-shell.** + +*************************************************************************************************** + +# Getting Started + +The entry point into all relational functionallity in Spark is the +[SQLContext](api/sql/core/index.html#org.apache.spark.sql.SQLContext) class, or one of its +decendents. To create a basic SQLContext, all you need is a SparkContext. + +{% highlight scala %} +val sc: SparkContext // An existing SparkContext. +val sqlContext = new org.apache.spark.sql.SQLContext(sc) + +// Importing the SQL context gives access to all the public SQL functions and implicit conversions. +import sqlContext._ +{% endhighlight %} + +## Running SQL on RDDs +One type of table that is supported by Spark SQL is an RDD of Scala case classetees. The case class +defines the schema of the table. The names of the arguments to the case class are read using +reflection and become the names of the columns. Case classes can also be nested or contain complex +types such as Sequences or Arrays. This RDD can be implicitly converted to a SchemaRDD and then be +registered as a table. Tables can used in subsequent SQL statements. + +{% highlight scala %} +val sqlContext = new org.apache.spark.sql.SQLContext(sc) +import sqlContext._ + +// Define the schema using a case class. +case class Person(name: String, age: Int) + +// Create an RDD of Person objects and register it as a table. 
+val people = sc.textFile("examples/src/main/resources/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt))
+people.registerAsTable("people")
+
+// SQL statements can be run by using the sql methods provided by sqlContext.
+val teenagers = sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
+
+// The results of SQL queries are SchemaRDDs and support all the normal RDD operations.
+// The columns of a row in the result can be accessed by ordinal.
+teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
+{% endhighlight %}
+
+**Note that Spark SQL currently uses a very basic SQL parser, and the keywords are case sensitive.**
+Users who want a more complete dialect of SQL should look at the HiveQL support provided by
+`HiveContext`.
+
+## Using Parquet
+
+Parquet is a columnar format that is supported by many other data processing systems. Spark SQL
+provides support for both reading and writing Parquet files, and automatically preserves the schema
+of the original data. Using the data from the above example:
+
+{% highlight scala %}
+val sqlContext = new org.apache.spark.sql.SQLContext(sc)
+import sqlContext._
+
+val people: RDD[Person] // An RDD of case class objects, from the previous example.
+
+// The RDD is implicitly converted to a SchemaRDD, allowing it to be stored using Parquet.
+people.saveAsParquetFile("people.parquet")
+
+// Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.
+// The result of loading a Parquet file is also a SchemaRDD.
+val parquetFile = sqlContext.parquetFile("people.parquet")
+
+// Parquet files can also be registered as tables and then used in SQL statements.
+parquetFile.registerAsTable("parquetFile")
+val teenagers = sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
+teenagers.collect().foreach(println)
+{% endhighlight %}
+
+## Writing Language-Integrated Relational Queries
+
+Spark SQL also supports a domain specific language for writing queries. Once again,
+using the data from the above examples:
+
+{% highlight scala %}
+val sqlContext = new org.apache.spark.sql.SQLContext(sc)
+import sqlContext._
+val people: RDD[Person] // An RDD of case class objects, from the first example.
+
+// The following is the same as 'SELECT name FROM people WHERE age >= 10 AND age <= 19'
+val teenagers = people.where('age >= 10).where('age <= 19).select('name)
+{% endhighlight %}
+
+The DSL uses Scala symbols to represent columns in the underlying table, which are identifiers
+prefixed with a tick (`'`). Implicit conversions turn these symbols into expressions that are
+evaluated by the SQL execution engine. A full list of the functions supported can be found in the
+[ScalaDoc](api/sql/core/index.html#org.apache.spark.sql.SchemaRDD).
+
+<!-- TODO: Include the table of operations here. -->
+
+# Hive Support
+
+Spark SQL also supports reading and writing data stored in [Apache Hive](http://hive.apache.org/).
+However, since Hive has a large number of dependencies, it is not included in the default Spark assembly.
+In order to use Hive you must first run '`sbt/sbt hive/assembly`'. This command builds a new assembly
+jar that includes Hive. When this jar is present, Spark will use the Hive
+assembly instead of the normal Spark assembly. Note that this Hive assembly jar must also be present
+on all of the worker nodes, as they will need access to the Hive serialization and deserialization libraries
+(SerDes) in order to access data stored in Hive.
+
+Configuration of Hive is done by placing your `hive-site.xml` file in `conf/`.
+
+When working with Hive one must construct a `HiveContext`, which inherits from `SQLContext`, and
+adds support for finding tables in the MetaStore and writing queries using HiveQL. Users who do
+not have an existing Hive deployment can also experiment with the `LocalHiveContext`,
+which is similar to `HiveContext`, but creates a local copy of the `metastore` and `warehouse`
+automatically.
+
+{% highlight scala %}
+val sc: SparkContext // An existing SparkContext.
+val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
+
+// Importing the SQL context gives access to all the public SQL functions and implicit conversions.
+import hiveContext._
+
+sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
+sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
+
+// Queries are expressed in HiveQL.
+sql("SELECT key, value FROM src").collect().foreach(println)
+{% endhighlight %}
\ No newline at end of file
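The result of a HiveQL query can be used like any other SchemaRDD. As a minimal follow-on sketch (assuming the `hiveContext` and `src` table created in the example above, and that HiveQL results support the same SchemaRDD operations shown earlier in the guide), the query result can be narrowed further with the language-integrated DSL:

{% highlight scala %}
// Select a subset of src with HiveQL; the WHERE clause and the DSL call below are
// illustrative assumptions based only on the APIs already shown in this guide.
val smallKeys = sql("SELECT key, value FROM src WHERE key < 10")

// Project out just the values using the DSL and print them.
smallKeys.select('value).collect().foreach(println)
{% endhighlight %}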