author    Reynold Xin <rxin@databricks.com>    2016-06-28 19:36:53 -0700
committer Reynold Xin <rxin@databricks.com>    2016-06-28 19:36:53 -0700
commit    363bcedeea40fe3f1a92271b96af2acba63e058c (patch)
tree      5609cc683dc3c7c15933bd63cc191d81996912f3
parent    5bf8881b34a18f25acc10aeb28a06af4c44a6ac8 (diff)
[SPARK-16248][SQL] Whitelist the list of Hive fallback functions
## What changes were proposed in this pull request?

This patch removes the blind fallback into Hive for functions. Instead, it creates a whitelist and adds to it only a small number of functions, i.e. the ones we intend to support in the long run in Spark.

## How was this patch tested?

Updated tests to reflect the change.

Author: Reynold Xin <rxin@databricks.com>

Closes #13939 from rxin/hive-whitelist.
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala                 |  1
-rw-r--r-- sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala     | 22
-rw-r--r-- sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala | 25
-rw-r--r-- sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala                                 | 42
4 files changed, 40 insertions(+), 50 deletions(-)
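
At its core, the patch replaces "try Spark's registry, then blindly try Hive" with "try Spark's registry, then try Hive only for an explicit whitelist of names". A minimal, self-contained Scala sketch of that lookup flow; the names `nativeRegistry`, `loadFromHive`, and `FunctionNotFound` are illustrative stand-ins, not Spark's actual API:

```scala
object WhitelistLookupSketch {
  final case class FunctionNotFound(name: String) extends Exception(name)

  // Toy stand-in for Spark's native FunctionRegistry.
  private val nativeRegistry: Map[String, Seq[Double] => Double] = Map(
    "max" -> ((xs: Seq[Double]) => xs.max),
    "min" -> ((xs: Seq[Double]) => xs.min)
  )

  // Names still allowed to fall back to Hive; intended to shrink to zero.
  private val hiveFunctions: Set[String] =
    Set("histogram_numeric", "percentile", "percentile_approx", "xpath")

  // Toy stand-in for lazily loading a builtin from Hive's function registry.
  private def loadFromHive(name: String): Seq[Double] => Double =
    xs => sys.error(s"pretend we loaded '$name' from Hive")

  def lookupFunction(name: String): Seq[Double] => Double = {
    val lower = name.toLowerCase // Hive is case insensitive
    nativeRegistry.getOrElse(lower, {
      // New behavior: unlisted names fail fast instead of probing Hive.
      if (!hiveFunctions.contains(lower)) throw FunctionNotFound(name)
      loadFromHive(lower)
    })
  }

  def main(args: Array[String]): Unit = {
    println(lookupFunction("MAX")(Seq(1.0, 2.0))) // 2.0, resolved natively
    println(lookupFunction("percentile"))         // whitelisted: Hive fallback
    // lookupFunction("ewah_bitmap") would throw FunctionNotFound
  }
}
```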
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 42a8faa412..0bde48ce57 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -248,6 +248,7 @@ object FunctionRegistry {
expression[Average]("mean"),
expression[Min]("min"),
expression[Skewness]("skewness"),
+ expression[StddevSamp]("std"),
expression[StddevSamp]("stddev"),
expression[StddevPop]("stddev_pop"),
expression[StddevSamp]("stddev_samp"),
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 2d5a970c12..13d18fdec0 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -517,6 +517,18 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
// This test uses CREATE EXTERNAL TABLE without specifying LOCATION
"alter2",
+ // [SPARK-16248][SQL] Whitelist the list of Hive fallback functions
+ "udf_field",
+ "udf_reflect2",
+ "udf_xpath",
+ "udf_xpath_boolean",
+ "udf_xpath_double",
+ "udf_xpath_float",
+ "udf_xpath_int",
+ "udf_xpath_long",
+ "udf_xpath_short",
+ "udf_xpath_string",
+
// These tests DROP TABLE that don't exist (but do not specify IF EXISTS)
"alter_rename_partition1",
"date_1",
@@ -1004,7 +1016,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"udf_elt",
"udf_equal",
"udf_exp",
- "udf_field",
"udf_find_in_set",
"udf_float",
"udf_floor",
@@ -1049,7 +1060,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"udf_power",
"udf_radians",
"udf_rand",
- "udf_reflect2",
"udf_regexp",
"udf_regexp_extract",
"udf_regexp_replace",
@@ -1090,14 +1100,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"udf_variance",
"udf_weekofyear",
"udf_when",
- "udf_xpath",
- "udf_xpath_boolean",
- "udf_xpath_double",
- "udf_xpath_float",
- "udf_xpath_int",
- "udf_xpath_long",
- "udf_xpath_short",
- "udf_xpath_string",
"union10",
"union11",
"union13",
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
index 6c3978154d..7ba5790c29 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
@@ -534,31 +534,6 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte
| rows between 2 preceding and 2 following);
""".stripMargin, reset = false)
- // collect_set() output array in an arbitrary order, hence causes different result
- // when running this test suite under Java 7 and 8.
- // We change the original sql query a little bit for making the test suite passed
- // under different JDK
- /* Disabled because:
- - Spark uses a different default stddev.
- - Tiny numerical differences in stddev results.
- createQueryTest("windowing.q -- 20. testSTATs",
- """
- |select p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
- |from (
- |select p_mfgr,p_name, p_size,
- |stddev(p_retailprice) over w1 as sdev,
- |stddev_pop(p_retailprice) over w1 as sdev_pop,
- |collect_set(p_size) over w1 as uniq_size,
- |variance(p_retailprice) over w1 as var,
- |corr(p_size, p_retailprice) over w1 as cor,
- |covar_pop(p_size, p_retailprice) over w1 as covarp
- |from part
- |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name
- | rows between 2 preceding and 2 following)
- |) t lateral view explode(uniq_size) d as uniq_data
- |order by p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
- """.stripMargin, reset = false)
- */
createQueryTest("windowing.q -- 21. testDISTs",
"""
|select p_mfgr,p_name, p_size,
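
The comment deleted above explains why the testSTATs query had been disabled: collect_set returns its elements in arbitrary order (which differed between JDK 7 and 8), and Spark's default stddev differs from Hive's. The ordering half has a standard workaround; a small sketch, not from the patch, that sorts the set before use:

```scala
import org.apache.spark.sql.SparkSession

object DeterministicCollectSet {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("sorted-collect-set").master("local[*]").getOrCreate()
    import spark.implicits._

    Seq(("m1", 2), ("m1", 6), ("m1", 2), ("m2", 40))
      .toDF("p_mfgr", "p_size").createOrReplaceTempView("part")
    // collect_set emits elements in arbitrary order; wrapping it in
    // sort_array makes the output stable and safe to diff against a golden file.
    spark.sql(
      """SELECT p_mfgr, sort_array(collect_set(p_size)) AS uniq_sizes
        |FROM part GROUP BY p_mfgr""".stripMargin).show()
    spark.stop()
  }
}
```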
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
index 2f6a220785..8a47dcf908 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
@@ -162,17 +162,6 @@ private[sql] class HiveSessionCatalog(
}
}
- // We have a list of Hive built-in functions that we do not support. So, we will check
- // Hive's function registry and lazily load needed functions into our own function registry.
- // Those Hive built-in functions are
- // assert_true, collect_list, collect_set, compute_stats, context_ngrams, create_union,
- // current_user ,elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
- // histogram_numeric, in_file, index, inline, java_method, map_keys, map_values,
- // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
- // parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2,
- // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean,
- // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number,
- // xpath_short, and xpath_string.
override def lookupFunction(name: FunctionIdentifier, children: Seq[Expression]): Expression = {
// TODO: Once lookupFunction accepts a FunctionIdentifier, we should refactor this method to
// if (super.functionExists(name)) {
@@ -196,10 +185,12 @@ private[sql] class HiveSessionCatalog(
// built-in function.
// Hive is case insensitive.
val functionName = funcName.unquotedString.toLowerCase
- // TODO: This may not really work for current_user because current_user is not evaluated
- // with session info.
- // We do not need to use executionHive at here because we only load
- // Hive's builtin functions, which do not need current db.
+ if (!hiveFunctions.contains(functionName)) {
+ failFunctionLookup(funcName.unquotedString)
+ }
+
+ // TODO: Remove this fallback path once we implement the list of fallback functions
+ // defined below in hiveFunctions.
val functionInfo = {
try {
Option(HiveFunctionRegistry.getFunctionInfo(functionName)).getOrElse(
@@ -221,4 +212,25 @@ private[sql] class HiveSessionCatalog(
}
}
}
+
+ /** List of functions we pass over to Hive. Note that over time this list should go to 0. */
+ // We have a list of Hive built-in functions that we do not support. So, we will check
+ // Hive's function registry and lazily load needed functions into our own function registry.
+ // List of functions we are explicitly not supporting are:
+ // compute_stats, context_ngrams, create_union,
+ // current_user, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
+ // in_file, index, java_method,
+ // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
+ // parse_url_tuple, posexplode, reflect2,
+ // str_to_map, windowingtablefunction.
+ private val hiveFunctions = Seq(
+ "elt", "hash", "java_method", "histogram_numeric",
+ "map_keys", "map_values",
+ "parse_url", "percentile", "percentile_approx", "reflect", "sentences", "stack", "str_to_map",
+ "xpath", "xpath_boolean", "xpath_double", "xpath_float", "xpath_int", "xpath_long",
+ "xpath_number", "xpath_short", "xpath_string",
+
+ // table generating function
+ "inline", "posexplode"
+ )
}
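
Behaviorally, the new guard in lookupFunction means a whitelisted name like `percentile` still loads lazily from Hive's registry, while an unlisted Hive builtin such as `ewah_bitmap` now fails lookup immediately instead of falling through to Hive. An illustrative end-to-end sketch, assuming a Hive-enabled local session; the exact error message is Spark's, not shown here:

```scala
import org.apache.spark.sql.SparkSession

object FallbackBehaviorExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("fallback-behavior").master("local[*]")
      .enableHiveSupport().getOrCreate()
    import spark.implicits._

    Seq(1L, 2L, 3L, 4L).toDF("x").createOrReplaceTempView("t")

    // "percentile" is on the whitelist, so it is still loaded from Hive.
    spark.sql("SELECT percentile(x, 0.5) FROM t").show()

    // "ewah_bitmap" is a Hive builtin that is NOT whitelisted; lookup now
    // fails fast during analysis rather than probing Hive's registry.
    try {
      spark.sql("SELECT ewah_bitmap(x) FROM t").show()
    } catch {
      case e: Exception => println(s"lookup failed as expected: ${e.getMessage}")
    }
    spark.stop()
  }
}
```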