author     Cheng Lian <lian@databricks.com>    2017-03-06 10:36:50 -0800
committer  Cheng Lian <lian@databricks.com>    2017-03-06 10:36:50 -0800
commit     339b53a1311e08521d84a83c94201fcf3c766fb2 (patch)
tree       929a7dc8b83acd79676049e9b0a43b771a54f7ed
parent     2a0bc867a4a1dad4ecac47701199e540d345ff4f (diff)
[SPARK-19737][SQL] New analysis rule for reporting unregistered functions without relying on relation resolution
## What changes were proposed in this pull request?

This PR adds a new `Once` analysis rule batch consisting of a single analysis rule, `LookupFunctions`, that performs a simple existence check over `UnresolvedFunction`s without actually resolving them. The benefit of this rule is that it doesn't require function arguments to be resolved first, and therefore doesn't rely on relation resolution, which may incur a potentially expensive partition/schema discovery cost. Please refer to [SPARK-19737][1] for more details about the motivation.

## How was this patch tested?

New test case added in `SessionCatalogSuite`.

[1]: https://issues.apache.org/jira/browse/SPARK-19737

Author: Cheng Lian <lian@databricks.com>

Closes #17168 from liancheng/spark-19737-lookup-functions.
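To make the intended behavior concrete, here is an illustrative sketch (the table, column, and function names are hypothetical, not part of this patch): with the new `Once` batch in place, an unregistered function is reported before the analyzer ever touches the relations in the query.

```scala
// Hypothetical spark-shell session. Before this patch, analyzing this query
// would first resolve `events`, which for a heavily partitioned Hive table can
// trigger expensive partition/schema discovery, only to fail afterwards on the
// unregistered function. With the LookupFunctions rule, analysis fails fast:
spark.sql("SELECT undefined_fn(value) FROM events")
// org.apache.spark.sql.AnalysisException: Undefined function: 'undefined_fn'
```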
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala            21
-rw-r--r--  sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala  23
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala                    5
3 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 6d569b612d..2f8489de6b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -117,6 +117,8 @@ class Analyzer(
     Batch("Hints", fixedPoint,
       new ResolveHints.ResolveBroadcastHints(conf),
       ResolveHints.RemoveAllHints),
+    Batch("Simple Sanity Check", Once,
+      LookupFunctions),
     Batch("Substitution", fixedPoint,
       CTESubstitution,
       WindowsSubstitution,
@@ -1039,6 +1041,25 @@ class Analyzer(
   }
 
   /**
+   * Checks whether a function identifier referenced by an [[UnresolvedFunction]] is defined in the
+   * function registry. Note that this rule doesn't try to resolve the [[UnresolvedFunction]]: it
+   * only performs a simple existence check based on the function identifier to quickly identify
+   * undefined functions without triggering relation resolution, which may incur a potentially
+   * expensive partition/schema discovery process in some cases.
+   *
+   * @see [[ResolveFunctions]]
+   * @see https://issues.apache.org/jira/browse/SPARK-19737
+   */
+  object LookupFunctions extends Rule[LogicalPlan] {
+    override def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressions {
+      case f: UnresolvedFunction if !catalog.functionExists(f.name) =>
+        withPosition(f) {
+          throw new NoSuchFunctionException(f.name.database.getOrElse("default"), f.name.funcName)
+        }
+    }
+  }
+
+  /**
    * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s.
    */
   object ResolveFunctions extends Rule[LogicalPlan] {
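The key distinction here is between checking that a function exists and actually resolving it. Below is a minimal sketch of the two `SessionCatalog` calls involved, assuming a catalog instance named `catalog` is in scope (the argument lists are illustrative):

```scala
import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}

// What LookupFunctions relies on: a cheap lookup keyed only by the function
// identifier. No argument expressions are needed, so relation resolution
// doesn't have to happen first.
val exists: Boolean = catalog.functionExists(FunctionIdentifier("undefined_fn"))

// What ResolveFunctions does instead: build the concrete Expression, which
// requires the argument expressions to be resolved already.
val resolved: Expression =
  catalog.lookupFunction(FunctionIdentifier("max"), Seq(Literal(1)))
```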
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
index a755231962..ffc272c6c0 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.catalog
 
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
+import org.apache.spark.sql.catalyst.{FunctionIdentifier, SimpleCatalystConf, TableIdentifier}
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
@@ -1196,4 +1196,25 @@ class SessionCatalogSuite extends PlanTest {
       catalog.listFunctions("unknown_db", "func*")
     }
   }
+
+  test("SPARK-19737: detect undefined functions without triggering relation resolution") {
+    import org.apache.spark.sql.catalyst.dsl.plans._
+
+    Seq(true, false) foreach { caseSensitive =>
+      val conf = SimpleCatalystConf(caseSensitive)
+      val catalog = new SessionCatalog(newBasicCatalog(), new SimpleFunctionRegistry, conf)
+      val analyzer = new Analyzer(catalog, conf)
+
+      // The analyzer should report the undefined function rather than the undefined table first.
+      val cause = intercept[AnalysisException] {
+        analyzer.execute(
+          UnresolvedRelation(TableIdentifier("undefined_table")).select(
+            UnresolvedFunction("undefined_fn", Nil, isDistinct = false)
+          )
+        )
+      }
+
+      assert(cause.getMessage.contains("Undefined function: 'undefined_fn'"))
+    }
+  }
 }
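For contrast, a hypothetical sketch of what the same plan yields when the new batch is absent (`analyzerWithoutLookupFunctions` is an assumed analyzer built without the "Simple Sanity Check" batch, and the exact message is approximate): since neither the table nor the function exists, relation resolution runs first and the error points at the table instead.

```scala
// Hypothetical: the same plan analyzed without the LookupFunctions batch fails
// on the missing relation rather than the missing function (message approximate).
val causeWithoutRule = intercept[AnalysisException] {
  analyzerWithoutLookupFunctions.execute(
    UnresolvedRelation(TableIdentifier("undefined_table")).select(
      UnresolvedFunction("undefined_fn", Nil, isDistinct = false)))
}
assert(causeWithoutRule.getMessage.contains("Table or view not found"))
```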
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
index c9be1b9d10..f1ea86890c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala
@@ -199,6 +199,11 @@ private[sql] class HiveSessionCatalog(
     }
   }
 
+  // TODO: Remove this method after implementing a Spark-native "histogram_numeric".
+  override def functionExists(name: FunctionIdentifier): Boolean = {
+    super.functionExists(name) || hiveFunctions.contains(name.funcName)
+  }
+
   /** List of functions we pass over to Hive. Note that over time this list should go to 0. */
   // We have a list of Hive built-in functions that we do not support. So, we will check
   // Hive's function registry and lazily load needed functions into our own function registry.
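A note on why this override matters (the query below is illustrative; `measurements` and `value` are hypothetical names): functions on the Hive pass-through list are only registered lazily, on first lookup, so without consulting that list the new `LookupFunctions` rule would reject them before they could ever be loaded.

```scala
// "histogram_numeric" is not in Spark's native function registry until it is
// lazily loaded from Hive. The override above makes functionExists() consult
// the pass-through list as well, so this query still passes the upfront
// LookupFunctions existence check:
spark.sql("SELECT histogram_numeric(value, 10) FROM measurements")
```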