[SPARK-12913] [SQL] Improve performance of stat functions

As benchmarked and discussed here: https://github.com/apache/spark/pull/10786/files#r50038294, the declarative aggregate function benefits from codegen and can be much faster than the imperative one. Author: Davies Liu <davies@databricks.com> Closes #10960 from davies/stddev.
author: Davies Liu <davies@databricks.com> 2016-02-02 11:50:14 -0800
committer: Davies Liu <davies.liu@gmail.com> 2016-02-02 11:50:14 -0800
commit: be5dd881f1eff248224a92d57cfd1309cb3acf38 (patch)
tree: 7fdf890c80dc6a7e63028b0829f1020ca0c65a54 /sql/hive
parent: 7f6e3ec79b77400f558ceffa10b2af011962115f (diff)
download: spark-be5dd881f1eff248224a92d57cfd1309cb3acf38.tar.gz
spark-be5dd881f1eff248224a92d57cfd1309cb3acf38.tar.bz2
spark-be5dd881f1eff248224a92d57cfd1309cb3acf38.zip
2 files changed, 10 insertions, 11 deletions
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 554d47d651..61b73fa557 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -325,6 +325,9 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "drop_partitions_ignore_protection",
     "protectmode",
 
+    // Hive returns null rather than NaN when n = 1
+    "udaf_covar_samp",
+
     // Spark parser treats numerical literals differently: it creates decimals instead of doubles.
     "udf_abs",
     "udf_format_number",
@@ -881,7 +884,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "type_widening",
     "udaf_collect_set",
     "udaf_covar_pop",
-    "udaf_covar_samp",
     "udaf_histogram_numeric",
     "udf2",
     "udf5",
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
index 7a9ed1eaf3..caf1db9ad0 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
@@ -798,7 +798,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
         """
           |SELECT corr(b, c) FROM covar_tab WHERE a = 3
         """.stripMargin),
-      Row(null) :: Nil)
+      Row(Double.NaN) :: Nil)
 
     checkAnswer(
       sqlContext.sql(
@@ -807,10 +807,10 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
         """.stripMargin),
       Row(1, null) ::
       Row(2, null) ::
-      Row(3, null) ::
-      Row(4, null) ::
-      Row(5, null) ::
-      Row(6, null) :: Nil)
+      Row(3, Double.NaN) ::
+      Row(4, Double.NaN) ::
+      Row(5, Double.NaN) ::
+      Row(6, Double.NaN) :: Nil)
 
     val corr7 = sqlContext.sql("SELECT corr(b, c) FROM covar_tab").collect()(0).getDouble(0)
     assert(math.abs(corr7 - 0.6633880657639323) < 1e-12)
@@ -841,11 +841,8 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
 
     // one row test
     val df3 = Seq.tabulate(1)(x => (1 * x, x * x * x - 2)).toDF("a", "b")
-    val cov_samp3 = df3.groupBy().agg(covar_samp("a", "b")).collect()(0).get(0)
-    assert(cov_samp3 == null)
-
-    val cov_pop3 = df3.groupBy().agg(covar_pop("a", "b")).collect()(0).getDouble(0)
-    assert(cov_pop3 == 0.0)
+    checkAnswer(df3.groupBy().agg(covar_samp("a", "b")), Row(Double.NaN))
+    checkAnswer(df3.groupBy().agg(covar_pop("a", "b")), Row(0.0))
   }
 
   test("no aggregation function (SPARK-11486)") {
author	Davies Liu <davies@databricks.com>	2016-02-02 11:50:14 -0800
committer	Davies Liu <davies.liu@gmail.com>	2016-02-02 11:50:14 -0800
commit	be5dd881f1eff248224a92d57cfd1309cb3acf38 (patch)
tree	7fdf890c80dc6a7e63028b0829f1020ca0c65a54 /sql/hive
parent	7f6e3ec79b77400f558ceffa10b2af011962115f (diff)
download	spark-be5dd881f1eff248224a92d57cfd1309cb3acf38.tar.gz spark-be5dd881f1eff248224a92d57cfd1309cb3acf38.tar.bz2 spark-be5dd881f1eff248224a92d57cfd1309cb3acf38.zip