Diffstat (limited to 'sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala')
-rw-r--r-- | sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala | 162
1 file changed, 155 insertions, 7 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
index aa44cba4b5..0cccc22e5a 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
@@ -17,22 +17,39 @@

 package org.apache.spark.sql.hive

+import scala.util.{Failure, Success, Try}
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.hive.conf.HiveConf
+import org.apache.hadoop.hive.ql.exec.{UDAF, UDF}
+import org.apache.hadoop.hive.ql.exec.{FunctionRegistry => HiveFunctionRegistry}
+import org.apache.hadoop.hive.ql.udf.generic.{AbstractGenericUDAFResolver, GenericUDF, GenericUDTF}
+
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.catalog.SessionCatalog
+import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
+import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder
+import org.apache.spark.sql.catalyst.catalog.{FunctionResourceLoader, SessionCatalog}
+import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo}
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.execution.datasources.BucketSpec
+import org.apache.spark.sql.hive.HiveShim.HiveFunctionWrapper
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.Utils

-class HiveSessionCatalog(
-    externalCatalog: HiveCatalog,
+private[sql] class HiveSessionCatalog(
+    externalCatalog: HiveExternalCatalog,
     client: HiveClient,
     context: HiveContext,
+    functionResourceLoader: FunctionResourceLoader,
+    functionRegistry: FunctionRegistry,
     conf: SQLConf)
-  extends SessionCatalog(externalCatalog, conf) {
+  extends SessionCatalog(externalCatalog, functionResourceLoader, functionRegistry, conf) {

   override def setCurrentDatabase(db: String): Unit = {
     super.setCurrentDatabase(db)
@@ -41,11 +58,11 @@ class HiveSessionCatalog(

   override def lookupRelation(name: TableIdentifier, alias: Option[String]): LogicalPlan = {
     val table = formatTableName(name.table)
-    if (name.database.isDefined || !tempTables.containsKey(table)) {
+    if (name.database.isDefined || !tempTables.contains(table)) {
       val newName = name.copy(table = table)
       metastoreCatalog.lookupRelation(newName, alias)
     } else {
-      val relation = tempTables.get(table)
+      val relation = tempTables(table)
       val tableWithQualifiers = SubqueryAlias(table, relation)
       // If an alias was specified by the lookup, wrap the plan in a subquery so that
       // attributes are properly qualified with this alias.
@@ -57,6 +74,11 @@ class HiveSessionCatalog(
   // | Methods and fields for interacting with HiveMetastoreCatalog |
   // ----------------------------------------------------------------

+  override def getDefaultDBPath(db: String): String = {
+    val defaultPath = context.hiveconf.getVar(HiveConf.ConfVars.METASTOREWAREHOUSE)
+    new Path(new Path(defaultPath), db + ".db").toString
+  }
+
   // Catalog for handling data source tables. TODO: This really doesn't belong here since it is
   // essentially a cache for metastore tables. However, it relies on a lot of session-specific
   // things so it would be a lot of work to split its functionality between HiveSessionCatalog
@@ -64,6 +86,7 @@ class HiveSessionCatalog(
   private val metastoreCatalog = new HiveMetastoreCatalog(client, context)

   val ParquetConversions: Rule[LogicalPlan] = metastoreCatalog.ParquetConversions
+  val OrcConversions: Rule[LogicalPlan] = metastoreCatalog.OrcConversions
   val CreateTables: Rule[LogicalPlan] = metastoreCatalog.CreateTables
   val PreInsertionCasts: Rule[LogicalPlan] = metastoreCatalog.PreInsertionCasts
@@ -71,7 +94,7 @@ class HiveSessionCatalog(
     metastoreCatalog.refreshTable(name)
   }

-  def invalidateTable(name: TableIdentifier): Unit = {
+  override def invalidateTable(name: TableIdentifier): Unit = {
     metastoreCatalog.invalidateTable(name)
   }

@@ -101,4 +124,129 @@ class HiveSessionCatalog(
     metastoreCatalog.cachedDataSourceTables.getIfPresent(key)
   }

+  override def makeFunctionBuilder(funcName: String, className: String): FunctionBuilder = {
+    makeFunctionBuilder(funcName, Utils.classForName(className))
+  }
+
+  /**
+   * Construct a [[FunctionBuilder]] based on the provided class that represents a function.
+   */
+  private def makeFunctionBuilder(name: String, clazz: Class[_]): FunctionBuilder = {
+    // When we instantiate hive UDF wrapper class, we may throw exception if the input
+    // expressions don't satisfy the hive UDF, such as type mismatch, input number
+    // mismatch, etc. Here we catch the exception and throw AnalysisException instead.
+    (children: Seq[Expression]) => {
+      try {
+        if (classOf[UDF].isAssignableFrom(clazz)) {
+          val udf = HiveSimpleUDF(name, new HiveFunctionWrapper(clazz.getName), children)
+          udf.dataType // Force it to check input data types.
+          udf
+        } else if (classOf[GenericUDF].isAssignableFrom(clazz)) {
+          val udf = HiveGenericUDF(name, new HiveFunctionWrapper(clazz.getName), children)
+          udf.dataType // Force it to check input data types.
+          udf
+        } else if (classOf[AbstractGenericUDAFResolver].isAssignableFrom(clazz)) {
+          val udaf = HiveUDAFFunction(name, new HiveFunctionWrapper(clazz.getName), children)
+          udaf.dataType // Force it to check input data types.
+          udaf
+        } else if (classOf[UDAF].isAssignableFrom(clazz)) {
+          val udaf = HiveUDAFFunction(
+            name,
+            new HiveFunctionWrapper(clazz.getName),
+            children,
+            isUDAFBridgeRequired = true)
+          udaf.dataType // Force it to check input data types.
+          udaf
+        } else if (classOf[GenericUDTF].isAssignableFrom(clazz)) {
+          val udtf = HiveGenericUDTF(name, new HiveFunctionWrapper(clazz.getName), children)
+          udtf.elementTypes // Force it to check input data types.
+          udtf
+        } else {
+          throw new AnalysisException(s"No handler for Hive UDF '${clazz.getCanonicalName}'")
+        }
+      } catch {
+        case ae: AnalysisException =>
+          throw ae
+        case NonFatal(e) =>
+          val analysisException =
+            new AnalysisException(s"No handler for Hive UDF '${clazz.getCanonicalName}': $e")
+          analysisException.setStackTrace(e.getStackTrace)
+          throw analysisException
+      }
+    }
+  }
+
+  // We have a list of Hive built-in functions that we do not support. So, we will check
+  // Hive's function registry and lazily load needed functions into our own function registry.
+  // Those Hive built-in functions are
+  // assert_true, collect_list, collect_set, compute_stats, context_ngrams, create_union,
+  // current_user, elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
+  // histogram_numeric, in_file, index, inline, java_method, map_keys, map_values,
+  // matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
+  // parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2,
+  // regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean,
+  // xpath_double, xpath_float, xpath_int, xpath_long, xpath_number,
+  // xpath_short, and xpath_string.
+  override def lookupFunction(name: String, children: Seq[Expression]): Expression = {
+    // TODO: Once lookupFunction accepts a FunctionIdentifier, we should refactor this method to
+    // if (super.functionExists(name)) {
+    //   super.lookupFunction(name, children)
+    // } else {
+    //   // This function is a Hive builtin function.
+    //   ...
+    // }
+    Try(super.lookupFunction(name, children)) match {
+      case Success(expr) => expr
+      case Failure(error) =>
+        if (functionRegistry.functionExists(name)) {
+          // If the function actually exists in functionRegistry, it means that there is an
+          // error when we create the Expression using the given children.
+          // We need to throw the original exception.
+          throw error
+        } else {
+          // This function is not in functionRegistry, let's try to load it as a Hive's
+          // built-in function.
+          // Hive is case insensitive.
+          val functionName = name.toLowerCase
+          // TODO: This may not really work for current_user because current_user is not evaluated
+          // with session info.
+          // We do not need to use executionHive at here because we only load
+          // Hive's builtin functions, which do not need current db.
+          val functionInfo = {
+            try {
+              Option(HiveFunctionRegistry.getFunctionInfo(functionName)).getOrElse(
+                failFunctionLookup(name))
+            } catch {
+              // If HiveFunctionRegistry.getFunctionInfo throws an exception,
+              // we are failing to load a Hive builtin function, which means that
+              // the given function is not a Hive builtin function.
+              case NonFatal(e) => failFunctionLookup(name)
+            }
+          }
+          val className = functionInfo.getFunctionClass.getName
+          val builder = makeFunctionBuilder(functionName, className)
+          // Put this Hive built-in function to our function registry.
+          val info = new ExpressionInfo(className, functionName)
+          createTempFunction(functionName, info, builder, ignoreIfExists = false)
+          // Now, we need to create the Expression.
+          functionRegistry.lookupFunction(functionName, children)
+        }
+    }
+  }
+
+  // Pre-load a few commonly used Hive built-in functions.
+  HiveSessionCatalog.preloadedHiveBuiltinFunctions.foreach {
+    case (functionName, clazz) =>
+      val builder = makeFunctionBuilder(functionName, clazz)
+      val info = new ExpressionInfo(clazz.getCanonicalName, functionName)
+      createTempFunction(functionName, info, builder, ignoreIfExists = false)
+  }
+}
+
+private[sql] object HiveSessionCatalog {
+  // This is the list of Hive's built-in functions that are commonly used and we want to
+  // pre-load when we create the FunctionRegistry.
+  val preloadedHiveBuiltinFunctions =
+    ("collect_set", classOf[org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCollectSet]) ::
+    ("collect_list", classOf[org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCollectList]) :: Nil
 }
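The heart of makeFunctionBuilder above is a chain of isAssignableFrom checks that picks a wrapper for the UDF class and converts any failure into a single exception type (AnalysisException). Below is a stand-alone sketch of that dispatch pattern only; SimpleFn, AggFn, AnalysisError, Increment, and SumAll are simplified hypothetical stand-ins, not Hive's or Spark's classes, and the eager check that the real code performs via udf.dataType is approximated here by instantiating the class reflectively at build time.

import scala.util.control.NonFatal

// Hypothetical stand-ins for the UDF interfaces checked in the diff.
trait SimpleFn { def apply(x: Int): Int }
trait AggFn { def combine(xs: Seq[Int]): Int }

// Stand-in for AnalysisException: one error type for all failures.
final class AnalysisError(msg: String) extends RuntimeException(msg)

object BuilderDispatchSketch {
  // Mirror the UDF / GenericUDF / UDAF chain: inspect the class with
  // isAssignableFrom, instantiate it reflectively, and wrap it as a builder.
  def makeBuilder(clazz: Class[_]): Seq[Int] => Int = {
    try {
      if (classOf[SimpleFn].isAssignableFrom(clazz)) {
        val fn = clazz.getDeclaredConstructor().newInstance().asInstanceOf[SimpleFn]
        xs => fn(xs.head)
      } else if (classOf[AggFn].isAssignableFrom(clazz)) {
        val fn = clazz.getDeclaredConstructor().newInstance().asInstanceOf[AggFn]
        xs => fn.combine(xs)
      } else {
        throw new AnalysisError(s"No handler for function class '${clazz.getCanonicalName}'")
      }
    } catch {
      case ae: AnalysisError => throw ae
      // Re-wrap anything else (e.g. a missing no-arg constructor), the same way the
      // diff converts arbitrary instantiation failures into AnalysisException.
      case NonFatal(e) => throw new AnalysisError(s"Cannot wrap '${clazz.getCanonicalName}': $e")
    }
  }
}

class Increment extends SimpleFn { def apply(x: Int): Int = x + 1 }
class SumAll extends AggFn { def combine(xs: Seq[Int]): Int = xs.sum }

object BuilderDispatchDemo {
  def main(args: Array[String]): Unit = {
    println(BuilderDispatchSketch.makeBuilder(classOf[Increment])(Seq(41)))  // 42
    println(BuilderDispatchSketch.makeBuilder(classOf[SumAll])(Seq(1, 2, 3))) // 6
  }
}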
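The lookupFunction override follows a try-then-fallback shape: attempt the session's own registry first, rethrow if the function exists but failed to build with the given children, and otherwise load it lazily from Hive's registry and cache the resolved builder. A minimal sketch of that shape, assuming a toy registry of Seq[Int] => Int builders and a hypothetical externalLookup standing in for HiveFunctionRegistry.getFunctionInfo:

import scala.collection.mutable
import scala.util.{Failure, Success, Try}

object LazyLookupSketch {
  // Toy registry: maps a function name to a builder from arguments to a result.
  private val registry = mutable.Map.empty[String, Seq[Int] => Int]

  // Stand-in for Hive's FunctionRegistry.getFunctionInfo: knows a few "built-ins".
  private def externalLookup(name: String): Option[Seq[Int] => Int] = name match {
    case "collect_max" => Some(args => args.max) // hypothetical built-in
    case _ => None
  }

  def lookupFunction(name: String, args: Seq[Int]): Int = {
    Try(registry(name)(args)) match {
      case Success(result) => result
      case Failure(error) =>
        if (registry.contains(name)) {
          // The function is registered; the failure came from applying it, so rethrow.
          throw error
        } else {
          // Not registered yet: try the external registry (case-insensitively),
          // cache the builder, then evaluate.
          val builder = externalLookup(name.toLowerCase)
            .getOrElse(throw new NoSuchElementException(s"Undefined function: $name"))
          registry(name.toLowerCase) = builder
          builder(args)
        }
    }
  }

  def main(args: Array[String]): Unit = {
    println(lookupFunction("collect_max", Seq(1, 5, 3))) // loads lazily, prints 5
    println(lookupFunction("collect_max", Seq(2, 4)))    // now served from the cache, prints 4
  }
}

The key property, as in the diff, is that a failure to apply an already-registered function is rethrown rather than being masked by the fallback path.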