diff options
author | Yin Huai <yhuai@databricks.com> | 2016-04-29 22:49:12 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-04-29 22:49:12 -0700 |
commit | ac41fc648de584f08863313fbac0c5bb6fc6a65e (patch) | |
tree | f72da33e155967bda250166f9083940bbe32c845 /sql/catalyst/src | |
parent | 7945f9f6d431453a192bea66f66fec813913e4c8 (diff) | |
download | spark-ac41fc648de584f08863313fbac0c5bb6fc6a65e.tar.gz spark-ac41fc648de584f08863313fbac0c5bb6fc6a65e.tar.bz2 spark-ac41fc648de584f08863313fbac0c5bb6fc6a65e.zip |
[SPARK-14591][SQL] Remove DataTypeParser and add more keywords to the nonReserved list.
## What changes were proposed in this pull request?
CatalystSqlParser can already parse data types, so we no longer need a separate DataTypeParser.
## How was this patch tested?
Existing tests
Author: Yin Huai <yhuai@databricks.com>
Closes #12796 from yhuai/removeDataTypeParser.
Diffstat (limited to 'sql/catalyst/src')
4 files changed, 14 insertions, 221 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 5efaf8f201..3851e4c706 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -22,7 +22,7 @@ import javax.annotation.Nullable import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} -import org.apache.spark.sql.catalyst.parser.DataTypeParser +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} @@ -189,7 +189,7 @@ case class SimpleCatalogRelation( (cols ++ catalogTable.partitionColumns).map { f => AttributeReference( f.name, - DataTypeParser.parse(f.dataType), + CatalystSqlParser.parseDataType(f.dataType), // Since data can be dumped in randomly with no validation, everything is nullable. nullable = true )(qualifier = Some(alias.getOrElse(metadata.identifier.table))) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala deleted file mode 100644 index 0eb13c600c..0000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.parser - -import scala.language.implicitConversions -import scala.util.matching.Regex -import scala.util.parsing.combinator.syntactical.StandardTokenParsers -import scala.util.parsing.input.CharArrayReader._ - -import org.apache.spark.sql.types._ - -/** - * This is a data type parser that can be used to parse string representations of data types - * provided in SQL queries. This parser is mixed in with DDLParser and SqlParser. - */ -private[sql] trait DataTypeParser extends StandardTokenParsers { - - // This is used to create a parser from a regex. We are using regexes for data type strings - // since these strings can be also used as column names or field names. 
- import lexical.Identifier - implicit def regexToParser(regex: Regex): Parser[String] = acceptMatch( - s"identifier matching regex ${regex}", - { case Identifier(str) if regex.unapplySeq(str).isDefined => str } - ) - - protected lazy val primitiveType: Parser[DataType] = - "(?i)string".r ^^^ StringType | - "(?i)float".r ^^^ FloatType | - "(?i)(?:int|integer)".r ^^^ IntegerType | - "(?i)tinyint".r ^^^ ByteType | - "(?i)smallint".r ^^^ ShortType | - "(?i)double".r ^^^ DoubleType | - "(?i)(?:bigint|long)".r ^^^ LongType | - "(?i)binary".r ^^^ BinaryType | - "(?i)boolean".r ^^^ BooleanType | - fixedDecimalType | - "(?i)decimal".r ^^^ DecimalType.USER_DEFAULT | - "(?i)date".r ^^^ DateType | - "(?i)timestamp".r ^^^ TimestampType | - varchar | - char - - protected lazy val fixedDecimalType: Parser[DataType] = - ("(?i)decimal".r ~> "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ { - case precision ~ scale => - DecimalType(precision.toInt, scale.toInt) - } - - protected lazy val char: Parser[DataType] = - "(?i)char".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType - - protected lazy val varchar: Parser[DataType] = - "(?i)varchar".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType - - protected lazy val arrayType: Parser[DataType] = - "(?i)array".r ~> "<" ~> dataType <~ ">" ^^ { - case tpe => ArrayType(tpe) - } - - protected lazy val mapType: Parser[DataType] = - "(?i)map".r ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ { - case t1 ~ _ ~ t2 => MapType(t1, t2) - } - - protected lazy val structField: Parser[StructField] = - ident ~ ":" ~ dataType ^^ { - case name ~ _ ~ tpe => StructField(name, tpe, nullable = true) - } - - protected lazy val structType: Parser[DataType] = - ("(?i)struct".r ~> "<" ~> repsep(structField, ",") <~ ">" ^^ { - case fields => new StructType(fields.toArray) - }) | - ("(?i)struct".r ~ "<>" ^^^ StructType(Nil)) - - protected lazy val dataType: Parser[DataType] = - arrayType | - mapType | - structType | - primitiveType - - def toDataType(dataTypeString: 
String): DataType = synchronized { - phrase(dataType)(new lexical.Scanner(dataTypeString)) match { - case Success(result, _) => result - case failure: NoSuccess => throw new DataTypeException(failMessage(dataTypeString)) - } - } - - private def failMessage(dataTypeString: String): String = { - s"Unsupported dataType: $dataTypeString. If you have a struct and a field name of it has " + - "any special characters, please use backticks (`) to quote that field name, e.g. `x+y`. " + - "Please note that backtick itself is not supported in a field name." - } -} - -private[sql] object DataTypeParser { - lazy val dataTypeParser = new DataTypeParser { - override val lexical = new SqlLexical - } - - def parse(dataTypeString: String): DataType = dataTypeParser.toDataType(dataTypeString) -} - -/** The exception thrown from the [[DataTypeParser]]. */ -private[sql] class DataTypeException(message: String) extends Exception(message) - -class SqlLexical extends scala.util.parsing.combinator.lexical.StdLexical { - case class DecimalLit(chars: String) extends Token { - override def toString: String = chars - } - - /* This is a work around to support the lazy setting */ - def initialize(keywords: Seq[String]): Unit = { - reserved.clear() - reserved ++= keywords - } - - /* Normal the keyword string */ - def normalizeKeyword(str: String): String = str.toLowerCase - - delimiters += ( - "@", "*", "+", "-", "<", "=", "<>", "!=", "<=", "!>", ">=", "!<", ">", "/", "(", ")", - ",", ";", "%", "{", "}", ":", "[", "]", ".", "&", "|", "^", "~", "<=>" - ) - - protected override def processIdent(name: String) = { - val token = normalizeKeyword(name) - if (reserved contains token) Keyword(token) else Identifier(name) - } - - override lazy val token: Parser[Token] = - ( rep1(digit) ~ scientificNotation ^^ { case i ~ s => DecimalLit(i.mkString + s) } - | '.' ~> (rep1(digit) ~ scientificNotation) ^^ - { case i ~ s => DecimalLit("0." + i.mkString + s) } - | rep1(digit) ~ ('.' 
~> digit.*) ~ scientificNotation ^^ - { case i1 ~ i2 ~ s => DecimalLit(i1.mkString + "." + i2.mkString + s) } - | digit.* ~ identChar ~ (identChar | digit).* ^^ - { case first ~ middle ~ rest => processIdent((first ++ (middle :: rest)).mkString) } - | rep1(digit) ~ ('.' ~> digit.*).? ^^ { - case i ~ None => NumericLit(i.mkString) - case i ~ Some(d) => DecimalLit(i.mkString + "." + d.mkString) - } - | '\'' ~> chrExcept('\'', '\n', EofCh).* <~ '\'' ^^ - { case chars => StringLit(chars mkString "") } - | '"' ~> chrExcept('"', '\n', EofCh).* <~ '"' ^^ - { case chars => StringLit(chars mkString "") } - | '`' ~> chrExcept('`', '\n', EofCh).* <~ '`' ^^ - { case chars => Identifier(chars mkString "") } - | EofCh ^^^ EOF - | '\'' ~> failure("unclosed string literal") - | '"' ~> failure("unclosed string literal") - | delim - | failure("illegal character") - ) - - override def identChar: Parser[Elem] = letter | elem('_') - - private lazy val scientificNotation: Parser[String] = - (elem('e') | elem('E')) ~> (elem('+') | elem('-')).? 
~ rep1(digit) ^^ { - case s ~ rest => "e" + s.mkString + rest.mkString - } - - override def whitespace: Parser[Any] = - ( whitespaceChar - | '/' ~ '*' ~ comment - | '/' ~ '/' ~ chrExcept(EofCh, '\n').* - | '#' ~ chrExcept(EofCh, '\n').* - | '-' ~ '-' ~ chrExcept(EofCh, '\n').* - | '/' ~ '*' ~ failure("unclosed comment") - ).* -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index 6f50be7a99..b06aa7bc52 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -25,7 +25,7 @@ import org.json4s.JsonDSL._ import org.apache.spark.SparkException import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InterpretedOrdering} -import org.apache.spark.sql.catalyst.parser.{DataTypeParser, LegacyTypeStringParser} +import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, LegacyTypeStringParser} import org.apache.spark.sql.catalyst.util.quoteIdentifier /** @@ -169,7 +169,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru * }}} */ def add(name: String, dataType: String): StructType = { - add(name, DataTypeParser.parse(dataType), nullable = true, Metadata.empty) + add(name, CatalystSqlParser.parseDataType(dataType), nullable = true, Metadata.empty) } /** @@ -184,7 +184,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru * }}} */ def add(name: String, dataType: String, nullable: Boolean): StructType = { - add(name, DataTypeParser.parse(dataType), nullable, Metadata.empty) + add(name, CatalystSqlParser.parseDataType(dataType), nullable, Metadata.empty) } /** @@ -202,7 +202,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru dataType: String, nullable: Boolean, metadata: Metadata): 
StructType = { - add(name, DataTypeParser.parse(dataType), nullable, metadata) + add(name, CatalystSqlParser.parseDataType(dataType), nullable, metadata) } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala index 07b89cb61f..40782978a7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.catalyst.parser import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types._ -abstract class AbstractDataTypeParserSuite extends SparkFunSuite { +class CatalystQlDataTypeParserSuite extends SparkFunSuite { - def parse(sql: String): DataType + def parse(sql: String): DataType = CatalystSqlParser.parseDataType(sql) def checkDataType(dataTypeString: String, expectedDataType: DataType): Unit = { test(s"parse ${dataTypeString.replace("\n", "")}") { @@ -30,7 +30,8 @@ abstract class AbstractDataTypeParserSuite extends SparkFunSuite { } } - def intercept(sql: String) + def intercept(sql: String): Unit = + intercept[ParseException](CatalystSqlParser.parseDataType(sql)) def unsupported(dataTypeString: String): Unit = { test(s"$dataTypeString is not supported") { @@ -115,38 +116,16 @@ abstract class AbstractDataTypeParserSuite extends SparkFunSuite { unsupported("it is not a data type") unsupported("struct<x+y: int, 1.1:timestamp>") unsupported("struct<x: int") -} - -class DataTypeParserSuite extends AbstractDataTypeParserSuite { - override def intercept(sql: String): Unit = - intercept[DataTypeException](DataTypeParser.parse(sql)) - override def parse(sql: String): DataType = - DataTypeParser.parse(sql) - - // A column name can be a reserved word in our DDL parser and SqlParser. + // DataType parser accepts certain reserved keywords. 
checkDataType( - "Struct<TABLE: string, CASE:boolean>", + "Struct<TABLE: string, DATE:boolean>", StructType( StructField("TABLE", StringType, true) :: - StructField("CASE", BooleanType, true) :: Nil) + StructField("DATE", BooleanType, true) :: Nil) ) - unsupported("struct<x int, y string>") - - unsupported("struct<`x``y` int>") -} - -class CatalystQlDataTypeParserSuite extends AbstractDataTypeParserSuite { - override def intercept(sql: String): Unit = - intercept[ParseException](CatalystSqlParser.parseDataType(sql)) - - override def parse(sql: String): DataType = - CatalystSqlParser.parseDataType(sql) - - // A column name can be a reserved word in our DDL parser and SqlParser. - unsupported("Struct<TABLE: string, CASE:boolean>") - + // Define struct columns without ':' checkDataType( "struct<x int, y string>", (new StructType).add("x", IntegerType).add("y", StringType)) |