author     Herman van Hovell <hvanhovell@questtec.nl>   2016-03-28 12:31:12 -0700
committer  Reynold Xin <rxin@databricks.com>            2016-03-28 12:31:12 -0700
commit     600c0b69cab4767e8e5a6f4284777d8b9d4bd40e (patch)
tree       bae635ab17a8b58400127f20bbbe5acaecc92f98 /sql/core/src
parent     1528ff4c9affe1df103c4b3abd56a86c71d8b753 (diff)
[SPARK-13713][SQL] Migrate parser from ANTLR3 to ANTLR4
### What changes were proposed in this pull request?

The current ANTLR3 parser is quite complex to maintain and suffers from code blow-ups. This PR introduces a new parser based on ANTLR4, modeled on [Presto's SQL parser](https://github.com/facebook/presto/blob/master/presto-parser/src/main/antlr4/com/facebook/presto/sql/parser/SqlBase.g4). The current implementation can parse and create Catalyst and SQL plans. Large parts of the HiveQl DDL and some of the DML functionality are currently missing; the plan is to add these in follow-up PRs.

This PR is a work in progress, and work remains in the following areas:

- [x] Error handling should be improved.
- [x] Documentation should be improved.
- [x] Multi-Insert needs to be tested.
- [ ] Naming and package locations.

### How was this patch tested?

Catalyst and SQL unit tests.

Author: Herman van Hovell <hvanhovell@questtec.nl>

Closes #11557 from hvanhovell/ngParser.
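As a rough sketch of how the new entry point is exercised: `parseExpression` is called from `functions.scala` in this patch; `parsePlan` is assumed here to be part of the shared parser interface, not shown in this diff.

    import org.apache.spark.sql.execution.SparkSqlParser

    // Parse a standalone expression string into a Catalyst expression.
    val expression = SparkSqlParser.parseExpression("a + rand()")

    // Parse a full statement into a LogicalPlan (assumed interface method).
    val plan = SparkSqlParser.parsePlan("SELECT key, count(*) FROM src GROUP BY key")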
Diffstat (limited to 'sql/core/src')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala  219
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/functions.scala                   5
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala       2
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala                    4
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala                2
5 files changed, 225 insertions(+), 7 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
new file mode 100644
index 0000000000..c098fa99c2
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution
+
+import scala.collection.JavaConverters._
+
+import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.parser.ng.{AbstractSqlParser, AstBuilder}
+import org.apache.spark.sql.catalyst.parser.ng.SqlBaseParser._
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
+import org.apache.spark.sql.execution.command.{DescribeCommand => _, _}
+import org.apache.spark.sql.execution.datasources._
+
+/**
+ * Concrete parser for Spark SQL statements.
+ */
+object SparkSqlParser extends AbstractSqlParser {
+ val astBuilder = new SparkSqlAstBuilder
+}
+
+/**
+ * Builder that converts an ANTLR ParseTree into a LogicalPlan/Expression/TableIdentifier.
+ */
+class SparkSqlAstBuilder extends AstBuilder {
+ import org.apache.spark.sql.catalyst.parser.ng.ParserUtils._
+
+ /**
+ * Create a [[SetCommand]] logical plan.
+ *
+ * Note that everything after the SET keyword is assumed to be part of the
+ * key-value pair. The split between key and value is made by searching for the first `=`
+ * character in the raw string.
+ */
+ override def visitSetConfiguration(ctx: SetConfigurationContext): LogicalPlan = withOrigin(ctx) {
+ // Construct the command.
+ val raw = remainder(ctx.SET.getSymbol)
+ val keyValueSeparatorIndex = raw.indexOf('=')
+ if (keyValueSeparatorIndex >= 0) {
+ val key = raw.substring(0, keyValueSeparatorIndex).trim
+ val value = raw.substring(keyValueSeparatorIndex + 1).trim
+ SetCommand(Some(key -> Option(value)))
+ } else if (raw.nonEmpty) {
+ SetCommand(Some(raw.trim -> None))
+ } else {
+ SetCommand(None)
+ }
+ }
+
+ /**
+ * Create a [[SetDatabaseCommand]] logical plan.
+ */
+ override def visitUse(ctx: UseContext): LogicalPlan = withOrigin(ctx) {
+ SetDatabaseCommand(ctx.db.getText)
+ }
+
+ /**
+ * Create a [[ShowTablesCommand]] logical plan.
+ */
+ override def visitShowTables(ctx: ShowTablesContext): LogicalPlan = withOrigin(ctx) {
+ if (ctx.LIKE != null) {
+ logWarning("SHOW TABLES LIKE option is ignored.")
+ }
+ ShowTablesCommand(Option(ctx.db).map(_.getText))
+ }
+
+ /**
+ * Create a [[RefreshTable]] logical plan.
+ */
+ override def visitRefreshTable(ctx: RefreshTableContext): LogicalPlan = withOrigin(ctx) {
+ RefreshTable(visitTableIdentifier(ctx.tableIdentifier))
+ }
+
+ /**
+ * Create a [[CacheTableCommand]] logical plan.
+ */
+ override def visitCacheTable(ctx: CacheTableContext): LogicalPlan = withOrigin(ctx) {
+ val query = Option(ctx.query).map(plan)
+ CacheTableCommand(ctx.identifier.getText, query, ctx.LAZY != null)
+ }
+
+ /**
+ * Create an [[UncacheTableCommand]] logical plan.
+ */
+ override def visitUncacheTable(ctx: UncacheTableContext): LogicalPlan = withOrigin(ctx) {
+ UncacheTableCommand(ctx.identifier.getText)
+ }
+
+ /**
+ * Create a [[ClearCacheCommand]] logical plan.
+ */
+ override def visitClearCache(ctx: ClearCacheContext): LogicalPlan = withOrigin(ctx) {
+ ClearCacheCommand
+ }
+
+ /**
+ * Create an [[ExplainCommand]] logical plan.
+ */
+ override def visitExplain(ctx: ExplainContext): LogicalPlan = withOrigin(ctx) {
+ val options = ctx.explainOption.asScala
+ if (options.exists(_.FORMATTED != null)) {
+ logWarning("EXPLAIN FORMATTED option is ignored.")
+ }
+ if (options.exists(_.LOGICAL != null)) {
+ logWarning("EXPLAIN LOGICAL option is ignored.")
+ }
+
+ // Create the explain command.
+ val statement = plan(ctx.statement)
+ if (isExplainableStatement(statement)) {
+ ExplainCommand(statement, extended = options.exists(_.EXTENDED != null))
+ } else {
+ ExplainCommand(OneRowRelation)
+ }
+ }
+
+ /**
+ * Determine if a plan should be explained at all.
+ */
+ protected def isExplainableStatement(plan: LogicalPlan): Boolean = plan match {
+ case _: datasources.DescribeCommand => false
+ case _ => true
+ }
+
+ /**
+ * Create a [[DescribeCommand]] logical plan.
+ */
+ override def visitDescribeTable(ctx: DescribeTableContext): LogicalPlan = withOrigin(ctx) {
+ // FORMATTED and columns are not supported. Return null and let the parser decide what to do
+ // with this (raise an exception or pass it on to a different system).
+ if (ctx.describeColName != null || ctx.FORMATTED != null || ctx.partitionSpec != null) {
+ null
+ } else {
+ datasources.DescribeCommand(
+ visitTableIdentifier(ctx.tableIdentifier),
+ ctx.EXTENDED != null)
+ }
+ }
+
+ /** Type to keep track of a table header. */
+ type TableHeader = (TableIdentifier, Boolean, Boolean, Boolean)
+
+ /**
+ * Validate a create table statement and return the [[TableIdentifier]].
+ */
+ override def visitCreateTableHeader(
+ ctx: CreateTableHeaderContext): TableHeader = withOrigin(ctx) {
+ val temporary = ctx.TEMPORARY != null
+ val ifNotExists = ctx.EXISTS != null
+ assert(!temporary || !ifNotExists,
+ "a CREATE TEMPORARY TABLE statement does not allow an IF NOT EXISTS clause.",
+ ctx)
+ (visitTableIdentifier(ctx.tableIdentifier), temporary, ifNotExists, ctx.EXTERNAL != null)
+ }
+
+ /**
+ * Create a [[CreateTableUsing]] or a [[CreateTableUsingAsSelect]] logical plan.
+ *
+ * TODO add bucketing and partitioning.
+ */
+ override def visitCreateTableUsing(ctx: CreateTableUsingContext): LogicalPlan = withOrigin(ctx) {
+ val (table, temp, ifNotExists, external) = visitCreateTableHeader(ctx.createTableHeader)
+ if (external) {
+ logWarning("EXTERNAL option is not supported.")
+ }
+ val options = Option(ctx.tablePropertyList).map(visitTablePropertyList).getOrElse(Map.empty)
+ val provider = ctx.tableProvider.qualifiedName.getText
+
+ if (ctx.query != null) {
+ // Get the backing query.
+ val query = plan(ctx.query)
+
+ // Determine the storage mode.
+ val mode = if (ifNotExists) {
+ SaveMode.Ignore
+ } else if (temp) {
+ SaveMode.Overwrite
+ } else {
+ SaveMode.ErrorIfExists
+ }
+ CreateTableUsingAsSelect(table, provider, temp, Array.empty, None, mode, options, query)
+ } else {
+ val struct = Option(ctx.colTypeList).map(createStructType)
+ CreateTableUsing(table, struct, provider, temp, options, ifNotExists, managedIfNoPath = false)
+ }
+ }
+
+ /**
+ * Convert a table property list into a key-value map.
+ */
+ override def visitTablePropertyList(
+ ctx: TablePropertyListContext): Map[String, String] = withOrigin(ctx) {
+ ctx.tableProperty.asScala.map { property =>
+ // A key can either be a String or a collection of dot-separated elements. We need to treat
+ // these differently.
+ val key = if (property.key.STRING != null) {
+ string(property.key.STRING)
+ } else {
+ property.key.getText
+ }
+ val value = Option(property.value).map(string).orNull
+ key -> value
+ }.toMap
+ }
+}
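A minimal standalone sketch of the first-`=` split implemented by `visitSetConfiguration` above; the helper name `parseSet` is illustrative only and not part of the patch:

    // Mirrors visitSetConfiguration: everything after SET is one raw string,
    // and the first '=' separates the key from the value.
    def parseSet(raw: String): Option[(String, Option[String])] = {
      val separatorIndex = raw.indexOf('=')
      if (separatorIndex >= 0) {
        Some(raw.substring(0, separatorIndex).trim -> Option(raw.substring(separatorIndex + 1).trim))
      } else if (raw.nonEmpty) {
        Some(raw.trim -> None)  // key only: query the current value
      } else {
        None                    // bare SET: list all properties
      }
    }

    parseSet("spark.sql.shuffle.partitions=10")  // Some((spark.sql.shuffle.partitions,Some(10)))
    parseSet("spark.sql.shuffle.partitions")     // Some((spark.sql.shuffle.partitions,None))
    parseSet("")                                 // None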
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 8abb9d7e4a..7ce15e3f35 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -27,8 +27,8 @@ import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedFunction}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
-import org.apache.spark.sql.catalyst.parser.CatalystQl
import org.apache.spark.sql.catalyst.plans.logical.BroadcastHint
+import org.apache.spark.sql.execution.SparkSqlParser
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils
@@ -1172,8 +1172,7 @@ object functions {
* @group normal_funcs
*/
def expr(expr: String): Column = {
- val parser = SQLContext.getActive().map(_.sessionState.sqlParser).getOrElse(new CatalystQl())
- Column(parser.parseExpression(expr))
+ Column(SparkSqlParser.parseExpression(expr))
}
//////////////////////////////////////////////////////////////////////////////////////////////
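With this change, `expr` no longer depends on an active SQLContext: every expression string is routed through the shared ANTLR4-based parser. A typical call, assuming `df` is a DataFrame with columns `a` and `b`:

    import org.apache.spark.sql.functions._

    // Both strings below are parsed by SparkSqlParser.parseExpression.
    df.select(expr("a + 1"), expr("abs(b)"))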
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
index e5f02caabc..9bc640763f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
@@ -81,7 +81,7 @@ private[sql] class SessionState(ctx: SQLContext) {
/**
* Parser that extracts expressions, plans, table identifiers etc. from SQL texts.
*/
- lazy val sqlParser: ParserInterface = new SparkQl(conf)
+ lazy val sqlParser: ParserInterface = SparkSqlParser
/**
* Planner that converts optimized logical plans to physical plans.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
index 5af1a4fcd7..a5a4ff13de 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -329,8 +329,8 @@ class JoinSuite extends QueryTest with SharedSQLContext {
}
test("full outer join") {
- upperCaseData.where('N <= 4).registerTempTable("left")
- upperCaseData.where('N >= 3).registerTempTable("right")
+ upperCaseData.where('N <= 4).registerTempTable("`left`")
+ upperCaseData.where('N >= 3).registerTempTable("`right`")
val left = UnresolvedRelation(TableIdentifier("left"), None)
val right = UnresolvedRelation(TableIdentifier("right"), None)
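The temporary table names are now backticked, presumably because `left` and `right` are reserved words in the new ANTLR4 grammar; SQL text referencing these tables needs the same quoting. A sketch, assuming a `sqlContext` in scope:

    // Unquoted left/right would be rejected as keywords by the new parser.
    sqlContext.sql("SELECT * FROM `left` FULL OUTER JOIN `right` ON `left`.N = `right`.N")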
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index c958eac266..b727e88668 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -1656,7 +1656,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
val e2 = intercept[AnalysisException] {
sql("select interval 23 nanosecond")
}
- assert(e2.message.contains("cannot recognize input near"))
+ assert(e2.message.contains("No interval can be constructed"))
}
test("SPARK-8945: add and subtract expressions for interval type") {