SPARK-2186: Spark SQL DSL support for simple aggregations such as SUM and AVG

**Description** This patch enables using the `.select()` function in SchemaRDD with functions such as `Sum`, `Count` and other. **Testing** Unit tests added. Author: Ximo Guanter Gonzalbez <ximo@tid.es> Closes #1211 from edrevo/add-expression-support-in-select and squashes the following commits: fe4a1e1 [Ximo Guanter Gonzalbez] Extend SQL DSL to functions e1d344a [Ximo Guanter Gonzalbez] SPARK-2186: Spark SQL DSL support for simple aggregations such as SUM and AVG (cherry picked from commit 5c6ec94da1bacd8e65a43acb92b6721493484e7b) Signed-off-by: Michael Armbrust <michael@databricks.com>
author: Ximo Guanter Gonzalbez <ximo@tid.es> 2014-07-02 10:03:44 -0700
committer: Michael Armbrust <michael@databricks.com> 2014-07-02 10:04:08 -0700
commit: 69112b0bff92a9eb7e6ce7d92a12bc99b8c05929 (patch)
tree: 1928be9857367de3c3caaa2972c7df47312eeb91 /sql
parent: a4c7541940d538c50155685b67f5e1910d05c51b (diff)
download: spark-69112b0bff92a9eb7e6ce7d92a12bc99b8c05929.tar.gz
spark-69112b0bff92a9eb7e6ce7d92a12bc99b8c05929.tar.bz2
spark-69112b0bff92a9eb7e6ce7d92a12bc99b8c05929.zip
3 files changed, 44 insertions, 8 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
index 26ad4837b0..1b503b957d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
@@ -108,6 +108,17 @@ package object dsl {
 
     implicit def symbolToUnresolvedAttribute(s: Symbol) = analysis.UnresolvedAttribute(s.name)
 
+    def sum(e: Expression) = Sum(e)
+    def sumDistinct(e: Expression) = SumDistinct(e)
+    def count(e: Expression) = Count(e)
+    def countDistinct(e: Expression*) = CountDistinct(e)
+    def avg(e: Expression) = Average(e)
+    def first(e: Expression) = First(e)
+    def min(e: Expression) = Min(e)
+    def max(e: Expression) = Max(e)
+    def upper(e: Expression) = Upper(e)
+    def lower(e: Expression) = Lower(e)
+
     implicit class DslSymbol(sym: Symbol) extends ImplicitAttribute { def s = sym.name }
     // TODO more implicit class for literal?
     implicit class DslString(val s: String) extends ImplicitOperators {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
index 7c0efb4566..8f9f54f610 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala
@@ -133,8 +133,13 @@ class SchemaRDD(
    *
    * @group Query
    */
-  def select(exprs: NamedExpression*): SchemaRDD =
-    new SchemaRDD(sqlContext, Project(exprs, logicalPlan))
+  def select(exprs: Expression*): SchemaRDD = {
+    val aliases = exprs.zipWithIndex.map {
+      case (ne: NamedExpression, _) => ne
+      case (e, i) => Alias(e, s"c$i")()
+    }
+    new SchemaRDD(sqlContext, Project(aliases, logicalPlan))
+  }
 
   /**
    * Filters the output, only returning those rows where `condition` evaluates to true.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala
index fb599e1e01..05aac66d81 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala
@@ -61,6 +61,26 @@ class DslQuerySuite extends QueryTest {
       Seq(Seq("1")))
   }
 
+  test("select with functions") {
+    checkAnswer(
+      testData.select(sum('value), avg('value), count(1)),
+      Seq(Seq(5050.0, 50.5, 100)))
+
+    checkAnswer(
+      testData2.select('a + 'b, 'a < 'b),
+      Seq(
+        Seq(2, false),
+        Seq(3, true),
+        Seq(3, false),
+        Seq(4, false),
+        Seq(4, false),
+        Seq(5, false)))
+
+    checkAnswer(
+      testData2.select(sumDistinct('a)),
+      Seq(Seq(6)))
+  }
+
   test("sorting") {
     checkAnswer(
       testData2.orderBy('a.asc, 'b.asc),
@@ -111,17 +131,17 @@ class DslQuerySuite extends QueryTest {
 
   test("average") {
     checkAnswer(
-      testData2.groupBy()(Average('a)),
+      testData2.groupBy()(avg('a)),
       2.0)
   }
 
   test("null average") {
     checkAnswer(
-      testData3.groupBy()(Average('b)),
+      testData3.groupBy()(avg('b)),
       2.0)
 
     checkAnswer(
-      testData3.groupBy()(Average('b), CountDistinct('b :: Nil)),
+      testData3.groupBy()(avg('b), countDistinct('b)),
       (2.0, 1) :: Nil)
   }
 
@@ -131,17 +151,17 @@ class DslQuerySuite extends QueryTest {
 
   test("null count") {
     checkAnswer(
-      testData3.groupBy('a)('a, Count('b)),
+      testData3.groupBy('a)('a, count('b)),
       Seq((1,0), (2, 1))
     )
 
     checkAnswer(
-      testData3.groupBy('a)('a, Count('a + 'b)),
+      testData3.groupBy('a)('a, count('a + 'b)),
       Seq((1,0), (2, 1))
     )
 
     checkAnswer(
-      testData3.groupBy()(Count('a), Count('b), Count(1), CountDistinct('a :: Nil), CountDistinct('b :: Nil)),
+      testData3.groupBy()(count('a), count('b), count(1), countDistinct('a), countDistinct('b)),
       (2, 1, 2, 2, 1) :: Nil
     )
   }
author	Ximo Guanter Gonzalbez <ximo@tid.es>	2014-07-02 10:03:44 -0700
committer	Michael Armbrust <michael@databricks.com>	2014-07-02 10:04:08 -0700
commit	69112b0bff92a9eb7e6ce7d92a12bc99b8c05929 (patch)
tree	1928be9857367de3c3caaa2972c7df47312eeb91 /sql
parent	a4c7541940d538c50155685b67f5e1910d05c51b (diff)
download	spark-69112b0bff92a9eb7e6ce7d92a12bc99b8c05929.tar.gz spark-69112b0bff92a9eb7e6ce7d92a12bc99b8c05929.tar.bz2 spark-69112b0bff92a9eb7e6ce7d92a12bc99b8c05929.zip