From 363bcedeea40fe3f1a92271b96af2acba63e058c Mon Sep 17 00:00:00 2001
From: Reynold Xin
Date: Tue, 28 Jun 2016 19:36:53 -0700
Subject: [SPARK-16248][SQL] Whitelist the list of Hive fallback functions

## What changes were proposed in this pull request?
This patch removes the blind fallback into Hive for functions. Instead, it creates a whitelist and adds only a small number of functions to the whitelist, i.e. the ones we intend to support in the long run in Spark.

## How was this patch tested?
Updated tests to reflect the change.

Author: Reynold Xin

Closes #13939 from rxin/hive-whitelist.
---
 .../sql/catalyst/analysis/FunctionRegistry.scala   |  1 +
 .../hive/execution/HiveCompatibilitySuite.scala    | 22 ++++++------
 .../execution/HiveWindowFunctionQuerySuite.scala   | 25 -------------
 .../apache/spark/sql/hive/HiveSessionCatalog.scala | 42 ++++++++++++++--------
 4 files changed, 40 insertions(+), 50 deletions(-)
(limited to 'sql')

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 42a8faa412..0bde48ce57 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -248,6 +248,7 @@ object FunctionRegistry {
     expression[Average]("mean"),
     expression[Min]("min"),
     expression[Skewness]("skewness"),
+    expression[StddevSamp]("std"),
     expression[StddevSamp]("stddev"),
     expression[StddevPop]("stddev_pop"),
     expression[StddevSamp]("stddev_samp"),
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
index 2d5a970c12..13d18fdec0 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala
@@ -517,6 +517,18 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     // This test uses CREATE EXTERNAL TABLE without specifying LOCATION
     "alter2",
 
+    // [SPARK-16248][SQL] Whitelist the list of Hive fallback functions
+    "udf_field",
+    "udf_reflect2",
+    "udf_xpath",
+    "udf_xpath_boolean",
+    "udf_xpath_double",
+    "udf_xpath_float",
+    "udf_xpath_int",
+    "udf_xpath_long",
+    "udf_xpath_short",
+    "udf_xpath_string",
+
     // These tests DROP TABLE that don't exist (but do not specify IF EXISTS)
     "alter_rename_partition1",
     "date_1",
@@ -1004,7 +1016,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_elt",
     "udf_equal",
     "udf_exp",
-    "udf_field",
     "udf_find_in_set",
     "udf_float",
     "udf_floor",
@@ -1049,7 +1060,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_power",
     "udf_radians",
     "udf_rand",
-    "udf_reflect2",
     "udf_regexp",
     "udf_regexp_extract",
     "udf_regexp_replace",
@@ -1090,14 +1100,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_variance",
     "udf_weekofyear",
     "udf_when",
-    "udf_xpath",
-    "udf_xpath_boolean",
-    "udf_xpath_double",
-    "udf_xpath_float",
-    "udf_xpath_int",
-    "udf_xpath_long",
-    "udf_xpath_short",
-    "udf_xpath_string",
     "union10",
     "union11",
     "union13",
diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
index 6c3978154d..7ba5790c29 100644
--- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
+++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala
@@ -534,31 +534,6 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte
       | rows between 2 preceding and 2 following);
     """.stripMargin, reset = false)
 
-  // collect_set() output array in an arbitrary order, hence causes different result
-  // when running this test suite under Java 7 and 8.
-  // We change the original sql query a little bit for making the test suite passed
-  // under different JDK
-  /* Disabled because:
-     - Spark uses a different default stddev.
-     - Tiny numerical differences in stddev results.
-  createQueryTest("windowing.q -- 20. testSTATs",
-    """
-      |select p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
-      |from (
-      |select p_mfgr,p_name, p_size,
-      |stddev(p_retailprice) over w1 as sdev,
-      |stddev_pop(p_retailprice) over w1 as sdev_pop,
-      |collect_set(p_size) over w1 as uniq_size,
-      |variance(p_retailprice) over w1 as var,
-      |corr(p_size, p_retailprice) over w1 as cor,
-      |covar_pop(p_size, p_retailprice) over w1 as covarp
-      |from part
-      |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name
-      |       rows between 2 preceding and 2 following)
-      |) t lateral view explode(uniq_size) d as uniq_data
-      |order by p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
-    """.stripMargin, reset = false)
-  */
   createQueryTest("windowing.q -- 21. testDISTs",
     """
       |select p_mfgr,p_name, p_size,
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
index 2f6a220785..8a47dcf908 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
@@ -162,17 +162,6 @@ private[sql] class HiveSessionCatalog(
     }
   }
 
-  // We have a list of Hive built-in functions that we do not support. So, we will check
-  // Hive's function registry and lazily load needed functions into our own function registry.
-  // Those Hive built-in functions are
-  // assert_true, collect_list, collect_set, compute_stats, context_ngrams, create_union,
-  // current_user, elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
-  // histogram_numeric, in_file, index, inline, java_method, map_keys, map_values,
-  // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
-  // parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2,
-  // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean,
-  // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number,
-  // xpath_short, and xpath_string.
   override def lookupFunction(name: FunctionIdentifier, children: Seq[Expression]): Expression = {
     // TODO: Once lookupFunction accepts a FunctionIdentifier, we should refactor this method to
     // if (super.functionExists(name)) {
@@ -196,10 +185,12 @@ private[sql] class HiveSessionCatalog(
       // built-in function.
       // Hive is case insensitive.
       val functionName = funcName.unquotedString.toLowerCase
-      // TODO: This may not really work for current_user because current_user is not evaluated
-      // with session info.
-      // We do not need to use executionHive at here because we only load
-      // Hive's builtin functions, which do not need current db.
+      if (!hiveFunctions.contains(functionName)) {
+        failFunctionLookup(funcName.unquotedString)
+      }
+
+      // TODO: Remove this fallback path once we implement the list of fallback functions
+      // defined below in hiveFunctions.
       val functionInfo = {
         try {
           Option(HiveFunctionRegistry.getFunctionInfo(functionName)).getOrElse(
@@ -221,4 +212,25 @@ private[sql] class HiveSessionCatalog(
       }
     }
   }
+
+  /** List of functions we pass over to Hive. Note that over time this list should go to 0. */
+  // We have a list of Hive built-in functions that we do not support. So, we will check
+  // Hive's function registry and lazily load needed functions into our own function registry.
+  // List of functions we are explicitly not supporting are:
+  // compute_stats, context_ngrams, create_union,
+  // current_user, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
+  // in_file, index, java_method,
+  // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
+  // parse_url_tuple, posexplode, reflect2,
+  // str_to_map, windowingtablefunction.
+  private val hiveFunctions = Seq(
+    "elt", "hash", "java_method", "histogram_numeric",
+    "map_keys", "map_values",
+    "parse_url", "percentile", "percentile_approx", "reflect", "sentences", "stack", "str_to_map",
+    "xpath", "xpath_boolean", "xpath_double", "xpath_float", "xpath_int", "xpath_long",
+    "xpath_number", "xpath_short", "xpath_string",
+
+    // table generating function
+    "inline", "posexplode"
+  )
 }
--
cgit v1.2.3
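For readers following the change to `HiveSessionCatalog.lookupFunction`, the sketch below distills the whitelist-gated lookup flow this patch introduces. It is a minimal, self-contained approximation, not Spark's actual code: `nativeFunctions` and the string return values are hypothetical stand-ins for Spark's `FunctionRegistry` plumbing, and only a few names from the real `hiveFunctions` list are shown.

```scala
// A minimal sketch of the lookup flow introduced by SPARK-16248.
// NOT Spark's real API: `nativeFunctions` and the string results are
// hypothetical placeholders for the actual registry machinery.
object WhitelistedFallbackSketch {

  // Mirrors the spirit of HiveSessionCatalog.hiveFunctions: the only
  // names still permitted to fall back to Hive's built-in registry.
  private val hiveFunctions: Set[String] =
    Set("histogram_numeric", "percentile", "xpath", "inline", "posexplode")

  // Hypothetical native registry; Spark's own functions take precedence.
  private val nativeFunctions: Set[String] = Set("stddev", "std", "max")

  def lookupFunction(name: String): String = {
    val fn = name.toLowerCase  // Hive is case insensitive
    if (nativeFunctions.contains(fn)) {
      s"native:$fn"            // resolved by Spark's own FunctionRegistry
    } else if (hiveFunctions.contains(fn)) {
      s"hive-fallback:$fn"     // lazily loaded from Hive's function registry
    } else {
      // The key behavioral change: no more blind fallback into Hive.
      throw new NoSuchElementException(s"Undefined function: $fn")
    }
  }
}
```

Under these assumptions, `lookupFunction("percentile")` still reaches Hive, while `lookupFunction("ewah_bitmap")` now fails fast with an undefined-function error instead of silently loading Hive's implementation, which is exactly the behavior the updated `HiveCompatibilitySuite` exclusions reflect.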