aboutsummaryrefslogtreecommitdiff
path: root/sql/hive
diff options
context:
space:
mode:
author: Davies Liu <davies@databricks.com> 2016-02-02 11:50:14 -0800
committer: Davies Liu <davies.liu@gmail.com> 2016-02-02 11:50:14 -0800
commit: be5dd881f1eff248224a92d57cfd1309cb3acf38 (patch)
tree: 7fdf890c80dc6a7e63028b0829f1020ca0c65a54 /sql/hive
parent: 7f6e3ec79b77400f558ceffa10b2af011962115f (diff)
downloadspark-be5dd881f1eff248224a92d57cfd1309cb3acf38.tar.gz
spark-be5dd881f1eff248224a92d57cfd1309cb3acf38.tar.bz2
spark-be5dd881f1eff248224a92d57cfd1309cb3acf38.zip
[SPARK-12913] [SQL] Improve performance of stat functions
As benchmarked and discussed here: https://github.com/apache/spark/pull/10786/files#r50038294, a declarative aggregate function benefits from codegen and can be much faster than an imperative one. Author: Davies Liu <davies@databricks.com> Closes #10960 from davies/stddev.
Diffstat (limited to 'sql/hive')
-rw-r--r-- sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala | 4
-rw-r--r-- sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala | 17
2 files changed, 10 insertions, 11 deletions
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 554d47d651..61b73fa557 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -325,6 +325,9 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"drop_partitions_ignore_protection",
"protectmode",
+ // Hive returns null rather than NaN when n = 1
+ "udaf_covar_samp",
+
// Spark parser treats numerical literals differently: it creates decimals instead of doubles.
"udf_abs",
"udf_format_number",
@@ -881,7 +884,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"type_widening",
"udaf_collect_set",
"udaf_covar_pop",
- "udaf_covar_samp",
"udaf_histogram_numeric",
"udf2",
"udf5",
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
index 7a9ed1eaf3..caf1db9ad0 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala
@@ -798,7 +798,7 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
"""
|SELECT corr(b, c) FROM covar_tab WHERE a = 3
""".stripMargin),
- Row(null) :: Nil)
+ Row(Double.NaN) :: Nil)
checkAnswer(
sqlContext.sql(
@@ -807,10 +807,10 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
""".stripMargin),
Row(1, null) ::
Row(2, null) ::
- Row(3, null) ::
- Row(4, null) ::
- Row(5, null) ::
- Row(6, null) :: Nil)
+ Row(3, Double.NaN) ::
+ Row(4, Double.NaN) ::
+ Row(5, Double.NaN) ::
+ Row(6, Double.NaN) :: Nil)
val corr7 = sqlContext.sql("SELECT corr(b, c) FROM covar_tab").collect()(0).getDouble(0)
assert(math.abs(corr7 - 0.6633880657639323) < 1e-12)
@@ -841,11 +841,8 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te
// one row test
val df3 = Seq.tabulate(1)(x => (1 * x, x * x * x - 2)).toDF("a", "b")
- val cov_samp3 = df3.groupBy().agg(covar_samp("a", "b")).collect()(0).get(0)
- assert(cov_samp3 == null)
-
- val cov_pop3 = df3.groupBy().agg(covar_pop("a", "b")).collect()(0).getDouble(0)
- assert(cov_pop3 == 0.0)
+ checkAnswer(df3.groupBy().agg(covar_samp("a", "b")), Row(Double.NaN))
+ checkAnswer(df3.groupBy().agg(covar_pop("a", "b")), Row(0.0))
}
test("no aggregation function (SPARK-11486)") {