author     ravipesala <ravindra.pesala@huawei.com>    2014-10-02 20:04:33 -0700
committer  Michael Armbrust <michael@databricks.com>  2014-10-02 20:04:33 -0700
commit     1c90347a4bba12df7b76d282a7dbac8e555e049f (patch)
tree       2f752a98d80c375fffc861ba7b11f07792d5dcd0
parent     7de4e50a01e90bcf88e0b721b2b15a5162373d56 (diff)
[SPARK-3654][SQL] Implement all extended HiveQL statements/commands with a separate parser combinator
Created a separate parser for HiveQL. It pre-parses commands like CACHE, UNCACHE, ADD JAR, etc., and then parses the remainder with HiveQl.

Author: ravipesala <ravindra.pesala@huawei.com>

Closes #2590 from ravipesala/SPARK-3654 and squashes the following commits:

bbca7dd [ravipesala] Fixed code as per admin comments.
ae9290a [ravipesala] Fixed style issues as per admin comments
898ed81 [ravipesala] Removed spaces
fb24edf [ravipesala] Updated the code as per admin comments
8947d37 [ravipesala] Removed duplicate code
ba26cd1 [ravipesala] Created separate parser for HiveQL. It pre-parses commands like cache, uncache, add jar, etc., and then parses with HiveQl
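As a rough sketch of what the new parser accepts, the hypothetical calls below (invented inputs; ExtendedHiveQlParser is private[hive], so this assumes code sitting in the org.apache.spark.sql.hive package) map each extension statement to the command node it produces:

    val parser = new ExtendedHiveQlParser              // from the new file below
    parser("CACHE TABLE src")                          // CacheCommand("src", true)
    parser("CACHE TABLE t AS SELECT * FROM src")       // CacheTableAsSelectCommand("t", <plan>)
    parser("UNCACHE TABLE src")                        // CacheCommand("src", false)
    parser("ADD JAR /tmp/my-udfs.jar")                 // AddJar("/tmp/my-udfs.jar")
    parser("set hive.exec.dynamic.partition=true")     // SetCommand(Some(...), Some("true"))
    parser("SELECT key FROM src")                      // falls through to HiveQl.createPlan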
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala  135
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala                 57
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala        6
3 files changed, 154 insertions(+), 44 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala
new file mode 100644
index 0000000000..e7e1cb980c
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/ExtendedHiveQlParser.scala
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import scala.language.implicitConversions
+import scala.util.parsing.combinator.syntactical.StandardTokenParsers
+import scala.util.parsing.combinator.PackratParsers
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.SqlLexical
+
+/**
+ * A parser that recognizes all HiveQL constructs together with several Spark SQL specific
+ * extensions like CACHE TABLE and UNCACHE TABLE.
+ */
+private[hive] class ExtendedHiveQlParser extends StandardTokenParsers with PackratParsers {
+
+ def apply(input: String): LogicalPlan = {
+ // Special-case out set commands since the value fields can be
+ // complex to handle without RegexParsers. Also this approach
+ // is clearer for the several possible cases of set commands.
+ if (input.trim.toLowerCase.startsWith("set")) {
+ input.trim.drop(3).split("=", 2).map(_.trim) match {
+ case Array("") => // "set"
+ SetCommand(None, None)
+ case Array(key) => // "set key"
+ SetCommand(Some(key), None)
+ case Array(key, value) => // "set key=value"
+ SetCommand(Some(key), Some(value))
+ }
+ } else if (input.trim.startsWith("!")) {
+ ShellCommand(input.drop(1))
+ } else {
+ phrase(query)(new lexical.Scanner(input)) match {
+ case Success(r, x) => r
+ case x => sys.error(x.toString)
+ }
+ }
+ }
+
+ protected case class Keyword(str: String)
+
+ protected val CACHE = Keyword("CACHE")
+ protected val SET = Keyword("SET")
+ protected val ADD = Keyword("ADD")
+ protected val JAR = Keyword("JAR")
+ protected val TABLE = Keyword("TABLE")
+ protected val AS = Keyword("AS")
+ protected val UNCACHE = Keyword("UNCACHE")
+ protected val FILE = Keyword("FILE")
+ protected val DFS = Keyword("DFS")
+ protected val SOURCE = Keyword("SOURCE")
+
+ protected implicit def asParser(k: Keyword): Parser[String] =
+ lexical.allCaseVersions(k.str).map(x => x : Parser[String]).reduce(_ | _)
+
+ protected def allCaseConverse(k: String): Parser[String] =
+ lexical.allCaseVersions(k).map(x => x : Parser[String]).reduce(_ | _)
+
+ protected val reservedWords =
+ this.getClass
+ .getMethods
+ .filter(_.getReturnType == classOf[Keyword])
+ .map(_.invoke(this).asInstanceOf[Keyword].str)
+
+ override val lexical = new SqlLexical(reservedWords)
+
+ protected lazy val query: Parser[LogicalPlan] =
+ cache | uncache | addJar | addFile | dfs | source | hiveQl
+
+ protected lazy val hiveQl: Parser[LogicalPlan] =
+ remainingQuery ^^ {
+ case r => HiveQl.createPlan(r.trim())
+ }
+
+ /** Returns the unparsed remainder of the input query. */
+ protected lazy val remainingQuery: Parser[String] = new Parser[String] {
+ def apply(in: Input) =
+ Success(
+ in.source.subSequence(in.offset, in.source.length).toString,
+ in.drop(in.source.length()))
+ }
+
+ /** Returns the entire input query. */
+ protected lazy val allQuery: Parser[String] = new Parser[String] {
+ def apply(in: Input) =
+ Success(in.source.toString, in.drop(in.source.length()))
+ }
+
+ protected lazy val cache: Parser[LogicalPlan] =
+ CACHE ~ TABLE ~> ident ~ opt(AS ~> hiveQl) ^^ {
+ case tableName ~ None => CacheCommand(tableName, true)
+ case tableName ~ Some(plan) =>
+ CacheTableAsSelectCommand(tableName, plan)
+ }
+
+ protected lazy val uncache: Parser[LogicalPlan] =
+ UNCACHE ~ TABLE ~> ident ^^ {
+ case tableName => CacheCommand(tableName, false)
+ }
+
+ protected lazy val addJar: Parser[LogicalPlan] =
+ ADD ~ JAR ~> remainingQuery ^^ {
+ case rq => AddJar(rq.trim())
+ }
+
+ protected lazy val addFile: Parser[LogicalPlan] =
+ ADD ~ FILE ~> remainingQuery ^^ {
+ case rq => AddFile(rq.trim())
+ }
+
+ protected lazy val dfs: Parser[LogicalPlan] =
+ DFS ~> allQuery ^^ {
+ case aq => NativeCommand(aq.trim())
+ }
+
+ protected lazy val source: Parser[LogicalPlan] =
+ SOURCE ~> remainingQuery ^^ {
+ case rq => SourceCommand(rq.trim())
+ }
+}
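The remainingQuery combinator above is the trick that makes the hybrid approach work: once an extension keyword has matched, everything left in the input is captured as one raw string and handed to Hive's own parser, rather than being consumed token by token. A minimal self-contained sketch of the same idea (not the Spark code; the PREFIX keyword and object name are invented for illustration):

    import scala.util.parsing.combinator.syntactical.StandardTokenParsers

    object RestOfInputSketch extends StandardTokenParsers {
      lexical.reserved += "PREFIX"

      // Captures every remaining character of the input, then advances the
      // reader to the end so that phrase() sees a complete match.
      val rest: Parser[String] = new Parser[String] {
        def apply(in: Input) =
          Success(
            in.source.subSequence(in.offset, in.source.length).toString,
            in.drop(in.source.length()))
      }

      val cmd: Parser[String] = "PREFIX" ~> rest

      def main(args: Array[String]): Unit = {
        val scanner = new lexical.Scanner("PREFIX select key from src")
        println(phrase(cmd)(scanner).get.trim) // prints: select key from src
      }
    }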
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 4f3f808c93..6bb42eeb05 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -126,6 +126,9 @@ private[hive] object HiveQl {
"TOK_CREATETABLE",
"TOK_DESCTABLE"
) ++ nativeCommands
+
+ // Parses a HiveQL query along with several Spark SQL specific extensions.
+ protected val hiveSqlParser = new ExtendedHiveQlParser
/**
* A set of implicit transformations that allow Hive ASTNodes to be rewritten by transformations
@@ -215,40 +218,19 @@ private[hive] object HiveQl {
def getAst(sql: String): ASTNode = ParseUtils.findRootNonNullToken((new ParseDriver).parse(sql))
/** Returns a LogicalPlan for a given HiveQL string. */
- def parseSql(sql: String): LogicalPlan = {
+ def parseSql(sql: String): LogicalPlan = hiveSqlParser(sql)
+
+ /** Creates a LogicalPlan for a given HiveQL string. */
+ def createPlan(sql: String) = {
try {
- if (sql.trim.toLowerCase.startsWith("set")) {
- // Split in two parts since we treat the part before the first "="
- // as key, and the part after as value, which may contain other "=" signs.
- sql.trim.drop(3).split("=", 2).map(_.trim) match {
- case Array("") => // "set"
- SetCommand(None, None)
- case Array(key) => // "set key"
- SetCommand(Some(key), None)
- case Array(key, value) => // "set key=value"
- SetCommand(Some(key), Some(value))
- }
- } else if (sql.trim.toLowerCase.startsWith("cache table")) {
- sql.trim.drop(12).trim.split(" ").toSeq match {
- case Seq(tableName) =>
- CacheCommand(tableName, true)
- case Seq(tableName, _, select @ _*) =>
- CacheTableAsSelectCommand(tableName, createPlan(select.mkString(" ").trim))
- }
- } else if (sql.trim.toLowerCase.startsWith("uncache table")) {
- CacheCommand(sql.trim.drop(14).trim, false)
- } else if (sql.trim.toLowerCase.startsWith("add jar")) {
- AddJar(sql.trim.drop(8).trim)
- } else if (sql.trim.toLowerCase.startsWith("add file")) {
- AddFile(sql.trim.drop(9))
- } else if (sql.trim.toLowerCase.startsWith("dfs")) {
+ val tree = getAst(sql)
+ if (nativeCommands contains tree.getText) {
NativeCommand(sql)
- } else if (sql.trim.startsWith("source")) {
- SourceCommand(sql.split(" ").toSeq match { case Seq("source", filePath) => filePath })
- } else if (sql.trim.startsWith("!")) {
- ShellCommand(sql.drop(1))
} else {
- createPlan(sql)
+ nodeToPlan(tree) match {
+ case NativePlaceholder => NativeCommand(sql)
+ case other => other
+ }
}
} catch {
case e: Exception => throw new ParseException(sql, e)
@@ -259,19 +241,6 @@ private[hive] object HiveQl {
""".stripMargin)
}
}
-
- /** Creates LogicalPlan for a given HiveQL string. */
- def createPlan(sql: String) = {
- val tree = getAst(sql)
- if (nativeCommands contains tree.getText) {
- NativeCommand(sql)
- } else {
- nodeToPlan(tree) match {
- case NativePlaceholder => NativeCommand(sql)
- case other => other
- }
- }
- }
def parseDdl(ddl: String): Seq[Attribute] = {
val tree =
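With the hunks above applied, parseSql is a thin delegate into the combinator parser, which in turn calls back into createPlan for anything that is plain HiveQL. A hedged trace for an ordinary query (method names from this diff; the input is illustrative):

    HiveQl.parseSql("SELECT key FROM src")
    //  -> hiveSqlParser("SELECT key FROM src")    ExtendedHiveQlParser.apply
    //  -> no extension keyword matches; the hiveQl rule captures the input
    //  -> HiveQl.createPlan("SELECT key FROM src")
    //  -> getAst(...) then nodeToPlan(...), with native commands
    //     falling back to NativeCommand(sql) as before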
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
index 188579edd7..b3057cd618 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala
@@ -88,4 +88,10 @@ class CachedTableSuite extends HiveComparisonTest {
}
assert(!TestHive.isCached("src"), "Table 'src' should not be cached")
}
+
+ test("'CACHE TABLE tableName AS SELECT ..'") {
+ TestHive.sql("CACHE TABLE testCacheTable AS SELECT * FROM src")
+ assert(TestHive.isCached("testCacheTable"), "Table 'testCacheTable' should be cached")
+ TestHive.uncacheTable("testCacheTable")
+ }
}
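For completeness, a hedged sketch of a companion round-trip check (same TestHive API as the test above; the table name is invented) that would also exercise the new UNCACHE TABLE path through the parser:

    TestHive.sql("CACHE TABLE demoTable AS SELECT key FROM src")
    assert(TestHive.isCached("demoTable"), "Table 'demoTable' should be cached")
    TestHive.sql("UNCACHE TABLE demoTable")
    assert(!TestHive.isCached("demoTable"), "Table 'demoTable' should not be cached")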