 project/SparkBuild.scala |  1 +
 sql/README.md            | 45 ++++++---------------------------
 2 files changed, 11 insertions(+), 35 deletions(-)
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index fbc8983b95..93698efe84 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -245,6 +245,7 @@ object SQL {
|import org.apache.spark.sql.catalyst.plans.logical._
|import org.apache.spark.sql.catalyst.rules._
|import org.apache.spark.sql.catalyst.util._
+ |import org.apache.spark.sql.Dsl._
|import org.apache.spark.sql.execution
|import org.apache.spark.sql.test.TestSQLContext._
|import org.apache.spark.sql.types._
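
The hunk above extends the import preamble that sbt feeds to the `hive/console` REPL. As a minimal sketch, this preamble is typically wired up through sbt's standard `initialCommands` setting; the real `SparkBuild.scala` block contains more imports and settings than shown here:

```scala
// Sketch only: sbt runs initialCommands when the console starts,
// so these imports are preloaded into every REPL session.
initialCommands in console :=
  """
    |import org.apache.spark.sql.catalyst.plans.logical._
    |import org.apache.spark.sql.Dsl._
    |import org.apache.spark.sql.test.TestSQLContext._
  """.stripMargin
```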
diff --git a/sql/README.md b/sql/README.md
index 61a20916a9..254ab8eb63 100644
--- a/sql/README.md
+++ b/sql/README.md
@@ -22,59 +22,34 @@ export HADOOP_HOME="<path to>/hadoop-1.0.4"
Using the console
=================
-An interactive scala console can be invoked by running `build/sbt hive/console`. From here you can execute queries and inspect the various stages of query optimization.
+An interactive Scala console can be invoked by running `build/sbt hive/console`.
+From here you can execute queries in HiveQL and manipulate DataFrames using the DSL.
```scala
catalyst$ build/sbt hive/console
[info] Starting scala interpreter...
-import org.apache.spark.sql.catalyst.analysis._
-import org.apache.spark.sql.catalyst.dsl._
-import org.apache.spark.sql.catalyst.errors._
-import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.catalyst.rules._
-import org.apache.spark.sql.catalyst.util._
-import org.apache.spark.sql.execution
+import org.apache.spark.sql.Dsl._
import org.apache.spark.sql.hive._
-import org.apache.spark.sql.hive.TestHive._
+import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.types._
+import org.apache.spark.sql.parquet.ParquetTestData
Welcome to Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_45).
Type in expressions to have them evaluated.
Type :help for more information.
scala> val query = sql("SELECT * FROM (SELECT * FROM src) a")
-query: org.apache.spark.sql.DataFrame =
-== Query Plan ==
-== Physical Plan ==
-HiveTableScan [key#10,value#11], (MetastoreRelation default, src, None), None
+query: org.apache.spark.sql.DataFrame = org.apache.spark.sql.DataFrame@74448eed
```
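
Note that the `DataFrame` result no longer pretty-prints its query plan in `toString`, as the removed output above did. The plans remain reachable through `queryExecution`; a minimal sketch, assuming the `QueryExecution` accessors that the removed transform section below relied on:

```scala
// The plans are still available even though toString no longer shows them.
query.queryExecution.analyzed      // logical plan after analysis
query.queryExecution.executedPlan  // physical plan chosen for execution
```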
-Query results are RDDs and can be operated as such.
+Query results are `DataFrames` and can be operated on as such.
```
scala> query.collect()
res2: Array[org.apache.spark.sql.Row] = Array([238,val_238], [86,val_86], [311,val_311], [27,val_27]...
```
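
Since `collect()` returns an `Array[Row]`, ordinary Scala collection operations apply. A small sketch; the `Int`/`String` field types are assumed from the schema of the `src` test table:

```scala
// Rows support positional getters; src's schema is (key: Int, value: String).
val rows  = query.collect()
val keys  = rows.map(_.getInt(0))                        // the key column
val pairs = rows.map(r => (r.getInt(0), r.getString(1))) // (key, value) tuples
```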
-You can also build further queries on top of these RDDs using the query DSL.
+You can also build further queries on top of these `DataFrames` using the query DSL.
```
-scala> query.where('key === 100).collect()
-res3: Array[org.apache.spark.sql.Row] = Array([100,val_100], [100,val_100])
-```
-
-From the console you can even write rules that transform query plans. For example, the above query has redundant project operators that aren't doing anything. This redundancy can be eliminated using the `transform` function that is available on all [`TreeNode`](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala) objects.
-```scala
-scala> query.queryExecution.analyzed
-res4: org.apache.spark.sql.catalyst.plans.logical.LogicalPlan =
-Project [key#10,value#11]
- Project [key#10,value#11]
- MetastoreRelation default, src, None
-
-
-scala> query.queryExecution.analyzed transform {
- | case Project(projectList, child) if projectList == child.output => child
- | }
-res5: res17: org.apache.spark.sql.catalyst.plans.logical.LogicalPlan =
-Project [key#10,value#11]
- MetastoreRelation default, src, None
+scala> query.where('key > 30).select(avg('key)).collect()
+res3: Array[org.apache.spark.sql.Row] = Array([274.79025423728814])
```
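
The DSL composes, so predicates and projections can be chained. A final sketch built from the operators already imported above; `&&` as a conjunction on `Column` is an assumption alongside the `>` shown in the example:

```scala
// Chaining DSL operators on the same query; && on Column is assumed
// to be available from Dsl._ alongside the comparison operators.
query.where('key > 30 && 'key < 300).select('key, 'value).collect()
query.select(avg('key)).collect()
```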