diff options
author | Yin Huai <yhuai@databricks.com> | 2016-04-29 22:49:12 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-04-29 22:49:12 -0700 |
commit | ac41fc648de584f08863313fbac0c5bb6fc6a65e (patch) | |
tree | f72da33e155967bda250166f9083940bbe32c845 /sql/catalyst/src | |
parent | 7945f9f6d431453a192bea66f66fec813913e4c8 (diff) | |
download | spark-ac41fc648de584f08863313fbac0c5bb6fc6a65e.tar.gz spark-ac41fc648de584f08863313fbac0c5bb6fc6a65e.tar.bz2 spark-ac41fc648de584f08863313fbac0c5bb6fc6a65e.zip |
[SPARK-14591][SQL] Remove DataTypeParser and add more keywords to the nonReserved list.
## What changes were proposed in this pull request?
CatalystSqlParser can already parse data types, so we no longer need a separate DataTypeParser.
## How was this patch tested?
Existing tests
Author: Yin Huai <yhuai@databricks.com>
Closes #12796 from yhuai/removeDataTypeParser.
Diffstat (limited to 'sql/catalyst/src')
4 files changed, 14 insertions, 221 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 5efaf8f201..3851e4c706 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -22,7 +22,7 @@ import javax.annotation.Nullable import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} -import org.apache.spark.sql.catalyst.parser.DataTypeParser +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} @@ -189,7 +189,7 @@ case class SimpleCatalogRelation( (cols ++ catalogTable.partitionColumns).map { f => AttributeReference( f.name, - DataTypeParser.parse(f.dataType), + CatalystSqlParser.parseDataType(f.dataType), // Since data can be dumped in randomly with no validation, everything is nullable. nullable = true )(qualifier = Some(alias.getOrElse(metadata.identifier.table))) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala deleted file mode 100644 index 0eb13c600c..0000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.parser - -import scala.language.implicitConversions -import scala.util.matching.Regex -import scala.util.parsing.combinator.syntactical.StandardTokenParsers -import scala.util.parsing.input.CharArrayReader._ - -import org.apache.spark.sql.types._ - -/** - * This is a data type parser that can be used to parse string representations of data types - * provided in SQL queries. This parser is mixed in with DDLParser and SqlParser. - */ -private[sql] trait DataTypeParser extends StandardTokenParsers { - - // This is used to create a parser from a regex. We are using regexes for data type strings - // since these strings can be also used as column names or field names. 
- import lexical.Identifier - implicit def regexToParser(regex: Regex): Parser[String] = acceptMatch( - s"identifier matching regex ${regex}", - { case Identifier(str) if regex.unapplySeq(str).isDefined => str } - ) - - protected lazy val primitiveType: Parser[DataType] = - "(?i)string".r ^^^ StringType | - "(?i)float".r ^^^ FloatType | - "(?i)(?:int|integer)".r ^^^ IntegerType | - "(?i)tinyint".r ^^^ ByteType | - "(?i)smallint".r ^^^ ShortType | - "(?i)double".r ^^^ DoubleType | - "(?i)(?:bigint|long)".r ^^^ LongType | - "(?i)binary".r ^^^ BinaryType | - "(?i)boolean".r ^^^ BooleanType | - fixedDecimalType | - "(?i)decimal".r ^^^ DecimalType.USER_DEFAULT | - "(?i)date".r ^^^ DateType | - "(?i)timestamp".r ^^^ TimestampType | - varchar | - char - - protected lazy val fixedDecimalType: Parser[DataType] = - ("(?i)decimal".r ~> "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ { - case precision ~ scale => - DecimalType(precision.toInt, scale.toInt) - } - - protected lazy val char: Parser[DataType] = - "(?i)char".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType - - protected lazy val varchar: Parser[DataType] = - "(?i)varchar".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType - - protected lazy val arrayType: Parser[DataType] = - "(?i)array".r ~> "<" ~> dataType <~ ">" ^^ { - case tpe => ArrayType(tpe) - } - - protected lazy val mapType: Parser[DataType] = - "(?i)map".r ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ { - case t1 ~ _ ~ t2 => MapType(t1, t2) - } - - protected lazy val structField: Parser[StructField] = - ident ~ ":" ~ dataType ^^ { - case name ~ _ ~ tpe => StructField(name, tpe, nullable = true) - } - - protected lazy val structType: Parser[DataType] = - ("(?i)struct".r ~> "<" ~> repsep(structField, ",") <~ ">" ^^ { - case fields => new StructType(fields.toArray) - }) | - ("(?i)struct".r ~ "<>" ^^^ StructType(Nil)) - - protected lazy val dataType: Parser[DataType] = - arrayType | - mapType | - structType | - primitiveType - - def toDataType(dataTypeString: 
String): DataType = synchronized { - phrase(dataType)(new lexical.Scanner(dataTypeString)) match { - case Success(result, _) => result - case failure: NoSuccess => throw new DataTypeException(failMessage(dataTypeString)) - } - } - - private def failMessage(dataTypeString: String): String = { - s"Unsupported dataType: $dataTypeString. If you have a struct and a field name of it has " + - "any special characters, please use backticks (`) to quote that field name, e.g. `x+y`. " + - "Please note that backtick itself is not supported in a field name." - } -} - -private[sql] object DataTypeParser { - lazy val dataTypeParser = new DataTypeParser { - override val lexical = new SqlLexical - } - - def parse(dataTypeString: String): DataType = dataTypeParser.toDataType(dataTypeString) -} - -/** The exception thrown from the [[DataTypeParser]]. */ -private[sql] class DataTypeException(message: String) extends Exception(message) - -class SqlLexical extends scala.util.parsing.combinator.lexical.StdLexical { - case class DecimalLit(chars: String) extends Token { - override def toString: String = chars - } - - /* This is a work around to support the lazy setting */ - def initialize(keywords: Seq[String]): Unit = { - reserved.clear() - reserved ++= keywords - } - - /* Normal the keyword string */ - def normalizeKeyword(str: String): String = str.toLowerCase - - delimiters += ( - "@", "*", "+", "-", "<", "=", "<>", "!=", "<=", "!>", ">=", "!<", ">", "/", "(", ")", - ",", ";", "%", "{", "}", ":", "[", "]", ".", "&", "|", "^", "~", "<=>" - ) - - protected override def processIdent(name: String) = { - val token = normalizeKeyword(name) - if (reserved contains token) Keyword(token) else Identifier(name) - } - - override lazy val token: Parser[Token] = - ( rep1(digit) ~ scientificNotation ^^ { case i ~ s => DecimalLit(i.mkString + s) } - | '.' ~> (rep1(digit) ~ scientificNotation) ^^ - { case i ~ s => DecimalLit("0." + i.mkString + s) } - | rep1(digit) ~ ('.' 
~> digit.*) ~ scientificNotation ^^ - { case i1 ~ i2 ~ s => DecimalLit(i1.mkString + "." + i2.mkString + s) } - | digit.* ~ identChar ~ (identChar | digit).* ^^ - { case first ~ middle ~ rest => processIdent((first ++ (middle :: rest)).mkString) } - | rep1(digit) ~ ('.' ~> digit.*).? ^^ { - case i ~ None => NumericLit(i.mkString) - case i ~ Some(d) => DecimalLit(i.mkString + "." + d.mkString) - } - | '\'' ~> chrExcept('\'', '\n', EofCh).* <~ '\'' ^^ - { case chars => StringLit(chars mkString "") } - | '"' ~> chrExcept('"', '\n', EofCh).* <~ '"' ^^ - { case chars => StringLit(chars mkString "") } - | '`' ~> chrExcept('`', '\n', EofCh).* <~ '`' ^^ - { case chars => Identifier(chars mkString "") } - | EofCh ^^^ EOF - | '\'' ~> failure("unclosed string literal") - | '"' ~> failure("unclosed string literal") - | delim - | failure("illegal character") - ) - - override def identChar: Parser[Elem] = letter | elem('_') - - private lazy val scientificNotation: Parser[String] = - (elem('e') | elem('E')) ~> (elem('+') | elem('-')).? 
~ rep1(digit) ^^ { - case s ~ rest => "e" + s.mkString + rest.mkString - } - - override def whitespace: Parser[Any] = - ( whitespaceChar - | '/' ~ '*' ~ comment - | '/' ~ '/' ~ chrExcept(EofCh, '\n').* - | '#' ~ chrExcept(EofCh, '\n').* - | '-' ~ '-' ~ chrExcept(EofCh, '\n').* - | '/' ~ '*' ~ failure("unclosed comment") - ).* -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index 6f50be7a99..b06aa7bc52 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -25,7 +25,7 @@ import org.json4s.JsonDSL._ import org.apache.spark.SparkException import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InterpretedOrdering} -import org.apache.spark.sql.catalyst.parser.{DataTypeParser, LegacyTypeStringParser} +import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, LegacyTypeStringParser} import org.apache.spark.sql.catalyst.util.quoteIdentifier /** @@ -169,7 +169,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru * }}} */ def add(name: String, dataType: String): StructType = { - add(name, DataTypeParser.parse(dataType), nullable = true, Metadata.empty) + add(name, CatalystSqlParser.parseDataType(dataType), nullable = true, Metadata.empty) } /** @@ -184,7 +184,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru * }}} */ def add(name: String, dataType: String, nullable: Boolean): StructType = { - add(name, DataTypeParser.parse(dataType), nullable, Metadata.empty) + add(name, CatalystSqlParser.parseDataType(dataType), nullable, Metadata.empty) } /** @@ -202,7 +202,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru dataType: String, nullable: Boolean, metadata: Metadata): 
StructType = { - add(name, DataTypeParser.parse(dataType), nullable, metadata) + add(name, CatalystSqlParser.parseDataType(dataType), nullable, metadata) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala index 07b89cb61f..40782978a7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.catalyst.parser import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types._ -abstract class AbstractDataTypeParserSuite extends SparkFunSuite { +class CatalystQlDataTypeParserSuite extends SparkFunSuite { - def parse(sql: String): DataType + def parse(sql: String): DataType = CatalystSqlParser.parseDataType(sql) def checkDataType(dataTypeString: String, expectedDataType: DataType): Unit = { test(s"parse ${dataTypeString.replace("\n", "")}") { @@ -30,7 +30,8 @@ abstract class AbstractDataTypeParserSuite extends SparkFunSuite { } } - def intercept(sql: String) + def intercept(sql: String): Unit = + intercept[ParseException](CatalystSqlParser.parseDataType(sql)) def unsupported(dataTypeString: String): Unit = { test(s"$dataTypeString is not supported") { @@ -115,38 +116,16 @@ abstract class AbstractDataTypeParserSuite extends SparkFunSuite { unsupported("it is not a data type") unsupported("struct<x+y: int, 1.1:timestamp>") unsupported("struct<x: int") -} - -class DataTypeParserSuite extends AbstractDataTypeParserSuite { - override def intercept(sql: String): Unit = - intercept[DataTypeException](DataTypeParser.parse(sql)) - override def parse(sql: String): DataType = - DataTypeParser.parse(sql) - - // A column name can be a reserved word in our DDL parser and SqlParser. + // DataType parser accepts certain reserved keywords. 
checkDataType( - "Struct<TABLE: string, CASE:boolean>", + "Struct<TABLE: string, DATE:boolean>", StructType( StructField("TABLE", StringType, true) :: - StructField("CASE", BooleanType, true) :: Nil) + StructField("DATE", BooleanType, true) :: Nil) ) - unsupported("struct<x int, y string>") - - unsupported("struct<`x``y` int>") -} - -class CatalystQlDataTypeParserSuite extends AbstractDataTypeParserSuite { - override def intercept(sql: String): Unit = - intercept[ParseException](CatalystSqlParser.parseDataType(sql)) - - override def parse(sql: String): DataType = - CatalystSqlParser.parseDataType(sql) - - // A column name can be a reserved word in our DDL parser and SqlParser. - unsupported("Struct<TABLE: string, CASE:boolean>") - + // Define struct columns without ':' checkDataType( "struct<x int, y string>", (new StructType).add("x", IntegerType).add("y", StringType)) |