From 363bcedeea40fe3f1a92271b96af2acba63e058c Mon Sep 17 00:00:00 2001
From: Reynold Xin
Date: Tue, 28 Jun 2016 19:36:53 -0700
Subject: [SPARK-16248][SQL] Whitelist the list of Hive fallback functions

## What changes were proposed in this pull request?
This patch removes the blind fallback into Hive for functions. Instead, it creates a whitelist and adds only a small number of functions to the whitelist, i.e. the ones we intend to support in the long run in Spark.

## How was this patch tested?
Updated tests to reflect the change.

Author: Reynold Xin

Closes #13939 from rxin/hive-whitelist.
---
 .../sql/catalyst/analysis/FunctionRegistry.scala   |  1 +
 .../hive/execution/HiveCompatibilitySuite.scala    | 22 ++++++------
 .../execution/HiveWindowFunctionQuerySuite.scala   | 25 -------------
 .../apache/spark/sql/hive/HiveSessionCatalog.scala | 42 ++++++++++++++--------
 4 files changed, 40 insertions(+), 50 deletions(-)
(limited to 'sql')

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 42a8faa412..0bde48ce57 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -248,6 +248,7 @@ object FunctionRegistry {
     expression[Average]("mean"),
     expression[Min]("min"),
     expression[Skewness]("skewness"),
+    expression[StddevSamp]("std"),
     expression[StddevSamp]("stddev"),
     expression[StddevPop]("stddev_pop"),
     expression[StddevSamp]("stddev_samp"),
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 2d5a970c12..13d18fdec0 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -517,6 +517,18 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     // This test uses CREATE EXTERNAL TABLE without specifying LOCATION
     "alter2",
 
+    // [SPARK-16248][SQL] Whitelist the list of Hive fallback functions
+    "udf_field",
+    "udf_reflect2",
+    "udf_xpath",
+    "udf_xpath_boolean",
+    "udf_xpath_double",
+    "udf_xpath_float",
+    "udf_xpath_int",
+    "udf_xpath_long",
+    "udf_xpath_short",
+    "udf_xpath_string",
+
     // These tests DROP TABLE that don't exist (but do not specify IF EXISTS)
     "alter_rename_partition1",
     "date_1",
@@ -1004,7 +1016,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_elt",
     "udf_equal",
     "udf_exp",
-    "udf_field",
     "udf_find_in_set",
     "udf_float",
     "udf_floor",
@@ -1049,7 +1060,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_power",
     "udf_radians",
     "udf_rand",
-    "udf_reflect2",
     "udf_regexp",
     "udf_regexp_extract",
     "udf_regexp_replace",
@@ -1090,14 +1100,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_variance",
     "udf_weekofyear",
     "udf_when",
-    "udf_xpath",
-    "udf_xpath_boolean",
-    "udf_xpath_double",
-    "udf_xpath_float",
-    "udf_xpath_int",
-    "udf_xpath_long",
-    "udf_xpath_short",
-    "udf_xpath_string",
     "union10",
     "union11",
     "union13",
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
index 6c3978154d..7ba5790c29 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
@@ -534,31 +534,6 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte
       | rows between 2 preceding and 2 following);
     """.stripMargin, reset = false)
 
-  // collect_set() output array in an arbitrary order, hence causes different result
-  // when running this test suite under Java 7 and 8.
-  // We change the original sql query a little bit for making the test suite passed
-  // under different JDK
-  /* Disabled because:
-     - Spark uses a different default stddev.
-     - Tiny numerical differences in stddev results.
-  createQueryTest("windowing.q -- 20. testSTATs",
-    """
-      |select p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
-      |from (
-      |select p_mfgr,p_name, p_size,
-      |stddev(p_retailprice) over w1 as sdev,
-      |stddev_pop(p_retailprice) over w1 as sdev_pop,
-      |collect_set(p_size) over w1 as uniq_size,
-      |variance(p_retailprice) over w1 as var,
-      |corr(p_size, p_retailprice) over w1 as cor,
-      |covar_pop(p_size, p_retailprice) over w1 as covarp
-      |from part
-      |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name
-      |       rows between 2 preceding and 2 following)
-      |) t lateral view explode(uniq_size) d as uniq_data
-      |order by p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
-    """.stripMargin, reset = false)
-  */
   createQueryTest("windowing.q -- 21. testDISTs",
     """
       |select p_mfgr,p_name, p_size,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
index 2f6a220785..8a47dcf908 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
@@ -162,17 +162,6 @@ private[sql] class HiveSessionCatalog(
     }
   }
 
-  // We have a list of Hive built-in functions that we do not support. So, we will check
-  // Hive's function registry and lazily load needed functions into our own function registry.
-  // Those Hive built-in functions are
-  // assert_true, collect_list, collect_set, compute_stats, context_ngrams, create_union,
-  // current_user, elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
-  // histogram_numeric, in_file, index, inline, java_method, map_keys, map_values,
-  // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
-  // parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2,
-  // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean,
-  // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number,
-  // xpath_short, and xpath_string.
   override def lookupFunction(name: FunctionIdentifier, children: Seq[Expression]): Expression = {
     // TODO: Once lookupFunction accepts a FunctionIdentifier, we should refactor this method to
     // if (super.functionExists(name)) {
@@ -196,10 +185,12 @@ private[sql] class HiveSessionCatalog(
       // built-in function.
       // Hive is case insensitive.
       val functionName = funcName.unquotedString.toLowerCase
-      // TODO: This may not really work for current_user because current_user is not evaluated
-      // with session info.
-      // We do not need to use executionHive at here because we only load
-      // Hive's builtin functions, which do not need current db.
+      if (!hiveFunctions.contains(functionName)) {
+        failFunctionLookup(funcName.unquotedString)
+      }
+
+      // TODO: Remove this fallback path once we implement the list of fallback functions
+      // defined below in hiveFunctions.
       val functionInfo = {
         try {
           Option(HiveFunctionRegistry.getFunctionInfo(functionName)).getOrElse(
@@ -221,4 +212,25 @@ private[sql] class HiveSessionCatalog(
       }
     }
   }
+
+  /** List of functions we pass over to Hive. Note that over time this list should go to 0. */
+  // We have a list of Hive built-in functions that we do not support. So, we will check
+  // Hive's function registry and lazily load needed functions into our own function registry.
+  // List of functions we are explicitly not supporting are:
+  // compute_stats, context_ngrams, create_union,
+  // current_user, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
+  // in_file, index, java_method,
+  // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
+  // parse_url_tuple, posexplode, reflect2,
+  // str_to_map, windowingtablefunction.
+  private val hiveFunctions = Seq(
+    "elt", "hash", "java_method", "histogram_numeric",
+    "map_keys", "map_values",
+    "parse_url", "percentile", "percentile_approx", "reflect", "sentences", "stack", "str_to_map",
+    "xpath", "xpath_boolean", "xpath_double", "xpath_float", "xpath_int", "xpath_long",
+    "xpath_number", "xpath_short", "xpath_string",
+
+    // table generating function
+    "inline", "posexplode"
+  )
 }
--
cgit v1.2.3
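For readers following the change to `HiveSessionCatalog.lookupFunction`, the sketch below distills the whitelist-gated lookup flow this patch introduces. It is a minimal, self-contained approximation, not Spark's actual code: `nativeFunctions` and the string return values are hypothetical stand-ins for Spark's `FunctionRegistry` plumbing, and only a few names from the real `hiveFunctions` list are shown.

```scala
// A minimal sketch of the lookup flow introduced by SPARK-16248.
// NOT Spark's real API: `nativeFunctions` and the string results are
// hypothetical placeholders for the actual registry machinery.
object WhitelistedFallbackSketch {

  // Mirrors the spirit of HiveSessionCatalog.hiveFunctions: the only
  // names still permitted to fall back to Hive's built-in registry.
  private val hiveFunctions: Set[String] =
    Set("histogram_numeric", "percentile", "xpath", "inline", "posexplode")

  // Hypothetical native registry; Spark's own functions take precedence.
  private val nativeFunctions: Set[String] = Set("stddev", "std", "max")

  def lookupFunction(name: String): String = {
    val fn = name.toLowerCase  // Hive is case insensitive
    if (nativeFunctions.contains(fn)) {
      s"native:$fn"            // resolved by Spark's own FunctionRegistry
    } else if (hiveFunctions.contains(fn)) {
      s"hive-fallback:$fn"     // lazily loaded from Hive's function registry
    } else {
      // The key behavioral change: no more blind fallback into Hive.
      throw new NoSuchElementException(s"Undefined function: $fn")
    }
  }
}
```

Under these assumptions, `lookupFunction("percentile")` still reaches Hive, while `lookupFunction("ewah_bitmap")` now fails fast with an undefined-function error instead of silently loading Hive's implementation, which is exactly the behavior the updated `HiveCompatibilitySuite` exclusions reflect.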