author    Cheng Lian <lian.cs.zju@gmail.com>  2014-10-09 18:25:06 -0700
committer Michael Armbrust <michael@databricks.com>  2014-10-09 18:25:06 -0700
commit    edf02da389f75df5a42465d41f035d6b65599848 (patch)
tree      1530a6ae85e804ff8ed0498e69a971bc569ee5ca /sql/hive
parent    363baacaded56047bcc63276d729ab911e0336cf (diff)
[SPARK-3654][SQL] Unifies SQL and HiveQL parsers
This PR is a follow-up to #2590 and introduces a top-level SQL parser entry point for all SQL dialects supported by Spark SQL. The new top-level parser, `SparkSQLParser`, handles the syntaxes that every SQL dialect should recognize (e.g. `CACHE TABLE`, `UNCACHE TABLE`, and `SET`). For any syntax it doesn't recognize directly, it falls back to a specified function that tries to parse arbitrary input into a `LogicalPlan`; this function is typically another parser combinator, such as `SqlParser`. The DDL syntaxes introduced in #2475 can be moved here. `ExtendedHiveQlParser` now handles only Hive-specific extensions. Also took the chance to refactor/reformat `SqlParser` for better readability.

Author: Cheng Lian <lian.cs.zju@gmail.com>

Closes #2698 from liancheng/gen-sql-parser and squashes the following commits:

ceada76 [Cheng Lian] Minor styling fixes
9738934 [Cheng Lian] Minor refactoring, removes optional trailing ";" in the parser
bb2ab12 [Cheng Lian] SET property value can be empty string
ce8860b [Cheng Lian] Passes test suites
e86968e [Cheng Lian] Removes debugging code
8bcace5 [Cheng Lian] Replaces digit.+ with rep1(digit) (Scala style checking doesn't like it)
d15d54f [Cheng Lian] Unifies SQL and HiveQL parsers
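As a side note, the delegation pattern described above is easy to sketch. The following is a minimal, self-contained illustration, not the actual Spark code; `Plan`, `TopLevelParser`, and `DialectPlan` are hypothetical stand-ins. The top-level parser recognizes shared syntax itself (only `SET` here, for brevity) and hands everything else to a dialect-specific fallback function:

// Hypothetical sketch of the top-level/fallback split; these are stand-ins,
// not the real Spark classes.
sealed trait Plan
case class SetCommand(key: Option[String], value: Option[String]) extends Plan
case class DialectPlan(sql: String) extends Plan

// Recognizes syntaxes shared by all dialects and falls back to a
// dialect-specific `String => Plan` for everything else.
class TopLevelParser(fallback: String => Plan) {
  private val SetCmd = """(?i)\s*SET\b\s*(.*)""".r

  def apply(input: String): Plan = input match {
    case SetCmd(rest) =>
      rest.split("=", 2).map(_.trim) match {
        case Array("")   => SetCommand(None, None)        // "SET"
        case Array(k)    => SetCommand(Some(k), None)     // "SET key"
        case Array(k, v) => SetCommand(Some(k), Some(v))  // "SET key=value"
      }
    case other => fallback(other) // e.g. SqlParser or the HiveQL parser
  }
}

object TopLevelParserDemo extends App {
  val parser = new TopLevelParser(sql => DialectPlan(sql))
  assert(parser("SET spark.x=1") == SetCommand(Some("spark.x"), Some("1")))
  assert(parser("SELECT 1") == DialectPlan("SELECT 1"))
}

In the real PR the fallback is a `String => LogicalPlan`, e.g. `SqlParser` for the SQL dialect or `ExtendedHiveQlParser`'s apply method for HiveQL.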
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala  110
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala                  15
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala           8
3 files changed, 32 insertions(+), 101 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala
index c5844e92ea..430ffb2998 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala
@@ -18,118 +18,50 @@
package org.apache.spark.sql.hive
import scala.language.implicitConversions
-import scala.util.parsing.combinator.syntactical.StandardTokenParsers
-import scala.util.parsing.combinator.PackratParsers
+
import org.apache.spark.sql.catalyst.plans.logical._
-import org.apache.spark.sql.catalyst.SqlLexical
+import org.apache.spark.sql.catalyst.{AbstractSparkSQLParser, SqlLexical}
/**
- * A parser that recognizes all HiveQL constructs together with several Spark SQL specific
- * extensions like CACHE TABLE and UNCACHE TABLE.
+ * A parser that recognizes all HiveQL constructs together with Spark SQL specific extensions.
*/
-private[hive] class ExtendedHiveQlParser extends StandardTokenParsers with PackratParsers {
-
- def apply(input: String): LogicalPlan = {
- // Special-case out set commands since the value fields can be
- // complex to handle without RegexParsers. Also this approach
- // is clearer for the several possible cases of set commands.
- if (input.trim.toLowerCase.startsWith("set")) {
- input.trim.drop(3).split("=", 2).map(_.trim) match {
- case Array("") => // "set"
- SetCommand(None, None)
- case Array(key) => // "set key"
- SetCommand(Some(key), None)
- case Array(key, value) => // "set key=value"
- SetCommand(Some(key), Some(value))
- }
- } else if (input.trim.startsWith("!")) {
- ShellCommand(input.drop(1))
- } else {
- phrase(query)(new lexical.Scanner(input)) match {
- case Success(r, x) => r
- case x => sys.error(x.toString)
- }
- }
- }
-
- protected case class Keyword(str: String)
-
- protected val ADD = Keyword("ADD")
- protected val AS = Keyword("AS")
- protected val CACHE = Keyword("CACHE")
- protected val DFS = Keyword("DFS")
- protected val FILE = Keyword("FILE")
- protected val JAR = Keyword("JAR")
- protected val LAZY = Keyword("LAZY")
- protected val SET = Keyword("SET")
- protected val SOURCE = Keyword("SOURCE")
- protected val TABLE = Keyword("TABLE")
- protected val UNCACHE = Keyword("UNCACHE")
-
+private[hive] class ExtendedHiveQlParser extends AbstractSparkSQLParser {
protected implicit def asParser(k: Keyword): Parser[String] =
lexical.allCaseVersions(k.str).map(x => x : Parser[String]).reduce(_ | _)
- protected def allCaseConverse(k: String): Parser[String] =
- lexical.allCaseVersions(k).map(x => x : Parser[String]).reduce(_ | _)
+ protected val ADD = Keyword("ADD")
+ protected val DFS = Keyword("DFS")
+ protected val FILE = Keyword("FILE")
+ protected val JAR = Keyword("JAR")
- protected val reservedWords =
- this.getClass
+ private val reservedWords =
+ this
+ .getClass
.getMethods
.filter(_.getReturnType == classOf[Keyword])
.map(_.invoke(this).asInstanceOf[Keyword].str)
override val lexical = new SqlLexical(reservedWords)
- protected lazy val query: Parser[LogicalPlan] =
- cache | uncache | addJar | addFile | dfs | source | hiveQl
+ protected lazy val start: Parser[LogicalPlan] = dfs | addJar | addFile | hiveQl
protected lazy val hiveQl: Parser[LogicalPlan] =
restInput ^^ {
- case statement => HiveQl.createPlan(statement.trim())
- }
-
- // Returns the whole input string
- protected lazy val wholeInput: Parser[String] = new Parser[String] {
- def apply(in: Input) =
- Success(in.source.toString, in.drop(in.source.length()))
- }
-
- // Returns the rest of the input string that are not parsed yet
- protected lazy val restInput: Parser[String] = new Parser[String] {
- def apply(in: Input) =
- Success(
- in.source.subSequence(in.offset, in.source.length).toString,
- in.drop(in.source.length()))
- }
-
- protected lazy val cache: Parser[LogicalPlan] =
- CACHE ~> opt(LAZY) ~ (TABLE ~> ident) ~ opt(AS ~> hiveQl) ^^ {
- case isLazy ~ tableName ~ plan =>
- CacheTableCommand(tableName, plan, isLazy.isDefined)
- }
-
- protected lazy val uncache: Parser[LogicalPlan] =
- UNCACHE ~ TABLE ~> ident ^^ {
- case tableName => UncacheTableCommand(tableName)
+ case statement => HiveQl.createPlan(statement.trim)
}
- protected lazy val addJar: Parser[LogicalPlan] =
- ADD ~ JAR ~> restInput ^^ {
- case jar => AddJar(jar.trim())
+ protected lazy val dfs: Parser[LogicalPlan] =
+ DFS ~> wholeInput ^^ {
+ case command => NativeCommand(command.trim)
}
- protected lazy val addFile: Parser[LogicalPlan] =
+ private lazy val addFile: Parser[LogicalPlan] =
ADD ~ FILE ~> restInput ^^ {
- case file => AddFile(file.trim())
+ case input => AddFile(input.trim)
}
- protected lazy val dfs: Parser[LogicalPlan] =
- DFS ~> wholeInput ^^ {
- case command => NativeCommand(command.trim())
- }
-
- protected lazy val source: Parser[LogicalPlan] =
- SOURCE ~> restInput ^^ {
- case file => SourceCommand(file.trim())
+ private lazy val addJar: Parser[LogicalPlan] =
+ ADD ~ JAR ~> restInput ^^ {
+ case input => AddJar(input.trim)
}
}
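A note on the `reservedWords` snippet retained above: since every `val FOO = Keyword("FOO")` member compiles to a public getter returning `Keyword`, reflecting over the class's methods collects all keywords without maintaining a separate list. A stand-alone sketch (class names hypothetical):

// Stand-alone illustration (hypothetical names) of the reservedWords trick:
// each `val X = Keyword("X")` becomes a public getter returning Keyword, so
// reflecting over the methods finds every keyword with no separate list.
case class Keyword(str: String)

class HiveExtensionKeywords {
  val ADD  = Keyword("ADD")
  val DFS  = Keyword("DFS")
  val FILE = Keyword("FILE")
  val JAR  = Keyword("JAR")

  def reservedWords: Seq[String] =
    this.getClass.getMethods
      .filter(_.getReturnType == classOf[Keyword])
      .map(_.invoke(this).asInstanceOf[Keyword].str)
      .toSeq
}

object KeywordDemo extends App {
  // Prints the four keywords (JVM reflection order is unspecified).
  println(new HiveExtensionKeywords().reservedWords.sorted)
}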
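The `restInput` and `wholeInput` parsers removed here (presumably now inherited from `AbstractSparkSQLParser`) rely on a useful trick: a `Parser` that always succeeds and returns the raw, untokenized remainder of the input, which is how `ADD JAR <path>` can accept an argument the lexer could never tokenize. A minimal runnable sketch, assuming the scala-parser-combinators library; the object name and sample path are made up:

import scala.util.parsing.combinator.syntactical.StandardTokenParsers

// Minimal demo (hypothetical object name) of the restInput trick: a Parser
// that succeeds immediately and returns the raw remainder of the input.
object RestInputDemo extends StandardTokenParsers {
  lexical.reserved ++= Seq("ADD", "JAR")

  // Succeeds with everything from the current offset to the end of input.
  lazy val restInput: Parser[String] = new Parser[String] {
    def apply(in: Input) =
      Success(
        in.source.subSequence(in.offset, in.source.length).toString,
        in.drop(in.source.length()))
  }

  lazy val addJar: Parser[String] = "ADD" ~ "JAR" ~> restInput ^^ (_.trim)

  def main(args: Array[String]): Unit = {
    val input = "ADD JAR /tmp/extra-udfs.jar"
    phrase(addJar)(new lexical.Scanner(input)) match {
      case Success(path, _) => println(s"would add jar: $path")
      case failure          => sys.error(failure.toString)
    }
  }
}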
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 32c9175f18..98a46a31e1 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -21,6 +21,7 @@ import org.apache.hadoop.hive.ql.lib.Node
import org.apache.hadoop.hive.ql.parse._
import org.apache.hadoop.hive.ql.plan.PlanUtils
+import org.apache.spark.sql.catalyst.SparkSQLParser
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans._
@@ -38,10 +39,6 @@ import scala.collection.JavaConversions._
*/
private[hive] case object NativePlaceholder extends Command
-private[hive] case class ShellCommand(cmd: String) extends Command
-
-private[hive] case class SourceCommand(filePath: String) extends Command
-
private[hive] case class AddFile(filePath: String) extends Command
private[hive] case class AddJar(path: String) extends Command
@@ -126,9 +123,11 @@ private[hive] object HiveQl {
"TOK_CREATETABLE",
"TOK_DESCTABLE"
) ++ nativeCommands
-
- // It parses hive sql query along with with several Spark SQL specific extensions
- protected val hiveSqlParser = new ExtendedHiveQlParser
+
+ protected val hqlParser = {
+ val fallback = new ExtendedHiveQlParser
+ new SparkSQLParser(fallback(_))
+ }
/**
* A set of implicit transformations that allow Hive ASTNodes to be rewritten by transformations
@@ -218,7 +217,7 @@ private[hive] object HiveQl {
def getAst(sql: String): ASTNode = ParseUtils.findRootNonNullToken((new ParseDriver).parse(sql))
/** Returns a LogicalPlan for a given HiveQL string. */
- def parseSql(sql: String): LogicalPlan = hiveSqlParser(sql)
+ def parseSql(sql: String): LogicalPlan = hqlParser(sql)
/** Creates LogicalPlan for a given HiveQL string. */
def createPlan(sql: String) = {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index 508d8239c7..5c66322f1e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -167,10 +167,10 @@ private[hive] trait HiveStrategies {
database.get,
tableName,
query,
- InsertIntoHiveTable(_: MetastoreRelation,
- Map(),
- query,
- true)(hiveContext)) :: Nil
+ InsertIntoHiveTable(_: MetastoreRelation,
+ Map(),
+ query,
+ overwrite = true)(hiveContext)) :: Nil
case _ => Nil
}
}
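The change in this last hunk is purely stylistic: the bare `true` becomes the named argument `overwrite = true`. A toy example (the `insertIntoTable` function is hypothetical) of why this reads better at boolean call sites:

object NamedArgDemo extends App {
  def insertIntoTable(table: String, overwrite: Boolean): Unit =
    println(s"insert into $table (overwrite = $overwrite)")

  insertIntoTable("src", true)              // unclear what `true` controls
  insertIntoTable("src", overwrite = true)  // self-documenting call site
}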