[SPARK-5643][SQL] Add a show method to print the content of a DataFrame in tabular format.

An example: ``` year month AVG('Adj Close) MAX('Adj Close) 1980 12 0.503218 0.595103 1981 01 0.523289 0.570307 1982 02 0.436504 0.475256 1983 03 0.410516 0.442194 1984 04 0.450090 0.483521 ``` Author: Reynold Xin <rxin@databricks.com> Closes #4416 from rxin/SPARK-5643 and squashes the following commits: d0e0d6e [Reynold Xin] [SQL] Minor update to data source and statistics documentation. 269da83 [Reynold Xin] Updated isLocal comment. 2cf3c27 [Reynold Xin] Moved logic into optimizer. 1a04d8b [Reynold Xin] [SPARK-5643][SQL] Add a show method to print the content of a DataFrame in columnar format.
author: Reynold Xin <rxin@databricks.com> 2015-02-08 18:56:51 -0800
committer: Reynold Xin <rxin@databricks.com> 2015-02-08 18:56:51 -0800
commit: a052ed42501fee3641348337505b6176426653c4 (patch)
tree: e708efe88401e54d61374b20e588b266c7836050 /sql/catalyst
parent: 56aff4bd6c7c9d18f4f962025708f20a4a82dcf0 (diff)
download: spark-a052ed42501fee3641348337505b6176426653c4.tar.gz
spark-a052ed42501fee3641348337505b6176426653c4.tar.bz2
spark-a052ed42501fee3641348337505b6176426653c4.zip
3 files changed, 79 insertions, 3 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 8c8f2896eb..3bc48c95c5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -50,7 +50,9 @@ object DefaultOptimizer extends Optimizer {
       CombineFilters,
       PushPredicateThroughProject,
       PushPredicateThroughJoin,
-      ColumnPruning) :: Nil
+      ColumnPruning) ::
+    Batch("LocalRelation", FixedPoint(100),
+      ConvertToLocalRelation) :: Nil
 }
 
 /**
@@ -610,3 +612,17 @@ object DecimalAggregates extends Rule[LogicalPlan] {
         DecimalType(prec + 4, scale + 4))
   }
 }
+
+/**
+ * Converts local operations (i.e. ones that don't require data exchange) on LocalRelation to
+ * another LocalRelation.
+ *
+ * This is relatively simple as it currently handles only a single case: Project.
+ */
+object ConvertToLocalRelation extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case Project(projectList, LocalRelation(output, data)) =>
+      val projection = new InterpretedProjection(projectList, output)
+      LocalRelation(projectList.map(_.toAttribute), data.map(projection))
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index 8d30528328..7cf4b81274 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -29,12 +29,15 @@ import org.apache.spark.sql.catalyst.trees
 /**
  * Estimates of various statistics.  The default estimation logic simply lazily multiplies the
  * corresponding statistic produced by the children.  To override this behavior, override
- * `statistics` and assign it an overriden version of `Statistics`.
+ * `statistics` and assign it an overridden version of `Statistics`.
  *
- * '''NOTE''': concrete and/or overriden versions of statistics fields should pay attention to the
+ * '''NOTE''': concrete and/or overridden versions of statistics fields should pay attention to the
  * performance of the implementations.  The reason is that estimations might get triggered in
  * performance-critical processes, such as query plan planning.
  *
+ * Note that we are using a BigInt here since it is easy to overflow a 64-bit integer in
+ * cardinality estimation (e.g. cartesian joins).
+ *
  * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it
  *                    defaults to the product of children's `sizeInBytes`.
  */
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala
new file mode 100644
index 0000000000..cf42d43823
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ConvertToLocalRelationSuite.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+
+
+class ConvertToLocalRelationSuite extends PlanTest {
+
+  object Optimize extends RuleExecutor[LogicalPlan] {
+    val batches =
+      Batch("LocalRelation", FixedPoint(100),
+        ConvertToLocalRelation) :: Nil
+  }
+
+  test("Project on LocalRelation should be turned into a single LocalRelation") {
+    val testRelation = LocalRelation(
+      LocalRelation('a.int, 'b.int).output,
+      Row(1, 2) ::
+      Row(4, 5) :: Nil)
+
+    val correctAnswer = LocalRelation(
+      LocalRelation('a1.int, 'b1.int).output,
+      Row(1, 3) ::
+      Row(4, 6) :: Nil)
+
+    val projectOnLocal = testRelation.select(
+      UnresolvedAttribute("a").as("a1"),
+      (UnresolvedAttribute("b") + 1).as("b1"))
+
+    val optimized = Optimize(projectOnLocal.analyze)
+
+    comparePlans(optimized, correctAnswer)
+  }
+
+}
author	Reynold Xin <rxin@databricks.com>	2015-02-08 18:56:51 -0800
committer	Reynold Xin <rxin@databricks.com>	2015-02-08 18:56:51 -0800
commit	a052ed42501fee3641348337505b6176426653c4 (patch)
tree	e708efe88401e54d61374b20e588b266c7836050 /sql/catalyst
parent	56aff4bd6c7c9d18f4f962025708f20a4a82dcf0 (diff)
download	spark-a052ed42501fee3641348337505b6176426653c4.tar.gz spark-a052ed42501fee3641348337505b6176426653c4.tar.bz2 spark-a052ed42501fee3641348337505b6176426653c4.zip