author    | Yin Huai <yhuai@databricks.com>   | 2016-04-29 22:49:12 -0700
committer | Reynold Xin <rxin@databricks.com> | 2016-04-29 22:49:12 -0700
commit    | ac41fc648de584f08863313fbac0c5bb6fc6a65e (patch)
tree      | f72da33e155967bda250166f9083940bbe32c845 /sql
parent    | 7945f9f6d431453a192bea66f66fec813913e4c8 (diff)
[SPARK-14591][SQL] Remove DataTypeParser and add more keywords to the nonReserved list.
## What changes were proposed in this pull request?
CatalystSqlParser can already parse data types, so there is no need for a separate DataTypeParser. This patch removes it and points every caller at CatalystSqlParser.parseDataType instead.
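For callers the change is a one-for-one API swap. A minimal sketch of the new entry point (the type strings below are illustrative, not taken from the patch itself):

```scala
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.types._

// Previously: DataTypeParser.parse("array<struct<x: int, y: decimal(10, 2)>>")
val parsed = CatalystSqlParser.parseDataType("array<struct<x: int, y: decimal(10, 2)>>")
assert(parsed == ArrayType(
  StructType(
    StructField("x", IntegerType) ::
    StructField("y", DecimalType(10, 2)) :: Nil)))

// String-typed public entry points (e.g. StructType.add, Column.cast) now delegate
// to the same call, as the diff below shows.
val schema = new StructType().add("id", "bigint").add("tags", "array<string>", false)
assert(schema("id").dataType == LongType)
```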
## How was this patch tested?
Existing tests
Author: Yin Huai <yhuai@databricks.com>
Closes #12796 from yhuai/removeDataTypeParser.
Diffstat (limited to 'sql')
9 files changed, 26 insertions, 232 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 5efaf8f201..3851e4c706 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -22,7 +22,7 @@ import javax.annotation.Nullable
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
-import org.apache.spark.sql.catalyst.parser.DataTypeParser
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan}
@@ -189,7 +189,7 @@ case class SimpleCatalogRelation(
     (cols ++ catalogTable.partitionColumns).map { f =>
       AttributeReference(
         f.name,
-        DataTypeParser.parse(f.dataType),
+        CatalystSqlParser.parseDataType(f.dataType),
         // Since data can be dumped in randomly with no validation, everything is nullable.
         nullable = true
       )(qualifier = Some(alias.getOrElse(metadata.identifier.table)))
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala
deleted file mode 100644
index 0eb13c600c..0000000000
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/DataTypeParser.scala
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.parser
-
-import scala.language.implicitConversions
-import scala.util.matching.Regex
-import scala.util.parsing.combinator.syntactical.StandardTokenParsers
-import scala.util.parsing.input.CharArrayReader._
-
-import org.apache.spark.sql.types._
-
-/**
- * This is a data type parser that can be used to parse string representations of data types
- * provided in SQL queries. This parser is mixed in with DDLParser and SqlParser.
- */
-private[sql] trait DataTypeParser extends StandardTokenParsers {
-
-  // This is used to create a parser from a regex. We are using regexes for data type strings
-  // since these strings can be also used as column names or field names.
-  import lexical.Identifier
-  implicit def regexToParser(regex: Regex): Parser[String] = acceptMatch(
-    s"identifier matching regex ${regex}",
-    { case Identifier(str) if regex.unapplySeq(str).isDefined => str }
-  )
-
-  protected lazy val primitiveType: Parser[DataType] =
-    "(?i)string".r ^^^ StringType |
-    "(?i)float".r ^^^ FloatType |
-    "(?i)(?:int|integer)".r ^^^ IntegerType |
-    "(?i)tinyint".r ^^^ ByteType |
-    "(?i)smallint".r ^^^ ShortType |
-    "(?i)double".r ^^^ DoubleType |
-    "(?i)(?:bigint|long)".r ^^^ LongType |
-    "(?i)binary".r ^^^ BinaryType |
-    "(?i)boolean".r ^^^ BooleanType |
-    fixedDecimalType |
-    "(?i)decimal".r ^^^ DecimalType.USER_DEFAULT |
-    "(?i)date".r ^^^ DateType |
-    "(?i)timestamp".r ^^^ TimestampType |
-    varchar |
-    char
-
-  protected lazy val fixedDecimalType: Parser[DataType] =
-    ("(?i)decimal".r ~> "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
-      case precision ~ scale =>
-        DecimalType(precision.toInt, scale.toInt)
-    }
-
-  protected lazy val char: Parser[DataType] =
-    "(?i)char".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType
-
-  protected lazy val varchar: Parser[DataType] =
-    "(?i)varchar".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType
-
-  protected lazy val arrayType: Parser[DataType] =
-    "(?i)array".r ~> "<" ~> dataType <~ ">" ^^ {
-      case tpe => ArrayType(tpe)
-    }
-
-  protected lazy val mapType: Parser[DataType] =
-    "(?i)map".r ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
-      case t1 ~ _ ~ t2 => MapType(t1, t2)
-    }
-
-  protected lazy val structField: Parser[StructField] =
-    ident ~ ":" ~ dataType ^^ {
-      case name ~ _ ~ tpe => StructField(name, tpe, nullable = true)
-    }
-
-  protected lazy val structType: Parser[DataType] =
-    ("(?i)struct".r ~> "<" ~> repsep(structField, ",") <~ ">" ^^ {
-      case fields => new StructType(fields.toArray)
-    }) |
-    ("(?i)struct".r ~ "<>" ^^^ StructType(Nil))
-
-  protected lazy val dataType: Parser[DataType] =
-    arrayType |
-    mapType |
-    structType |
-    primitiveType
-
-  def toDataType(dataTypeString: String): DataType = synchronized {
-    phrase(dataType)(new lexical.Scanner(dataTypeString)) match {
-      case Success(result, _) => result
-      case failure: NoSuccess => throw new DataTypeException(failMessage(dataTypeString))
-    }
-  }
-
-  private def failMessage(dataTypeString: String): String = {
-    s"Unsupported dataType: $dataTypeString. If you have a struct and a field name of it has " +
-      "any special characters, please use backticks (`) to quote that field name, e.g. `x+y`. " +
-      "Please note that backtick itself is not supported in a field name."
-  }
-}
-
-private[sql] object DataTypeParser {
-  lazy val dataTypeParser = new DataTypeParser {
-    override val lexical = new SqlLexical
-  }
-
-  def parse(dataTypeString: String): DataType = dataTypeParser.toDataType(dataTypeString)
-}
-
-/** The exception thrown from the [[DataTypeParser]]. */
-private[sql] class DataTypeException(message: String) extends Exception(message)
-
-class SqlLexical extends scala.util.parsing.combinator.lexical.StdLexical {
-  case class DecimalLit(chars: String) extends Token {
-    override def toString: String = chars
-  }
-
-  /* This is a work around to support the lazy setting */
-  def initialize(keywords: Seq[String]): Unit = {
-    reserved.clear()
-    reserved ++= keywords
-  }
-
-  /* Normal the keyword string */
-  def normalizeKeyword(str: String): String = str.toLowerCase
-
-  delimiters += (
-    "@", "*", "+", "-", "<", "=", "<>", "!=", "<=", "!>", ">=", "!<", ">", "/", "(", ")",
-    ",", ";", "%", "{", "}", ":", "[", "]", ".", "&", "|", "^", "~", "<=>"
-  )
-
-  protected override def processIdent(name: String) = {
-    val token = normalizeKeyword(name)
-    if (reserved contains token) Keyword(token) else Identifier(name)
-  }
-
-  override lazy val token: Parser[Token] =
-    ( rep1(digit) ~ scientificNotation ^^ { case i ~ s => DecimalLit(i.mkString + s) }
-    | '.' ~> (rep1(digit) ~ scientificNotation) ^^
-      { case i ~ s => DecimalLit("0." + i.mkString + s) }
-    | rep1(digit) ~ ('.' ~> digit.*) ~ scientificNotation ^^
-      { case i1 ~ i2 ~ s => DecimalLit(i1.mkString + "." + i2.mkString + s) }
-    | digit.* ~ identChar ~ (identChar | digit).* ^^
-      { case first ~ middle ~ rest => processIdent((first ++ (middle :: rest)).mkString) }
-    | rep1(digit) ~ ('.' ~> digit.*).? ^^ {
-        case i ~ None => NumericLit(i.mkString)
-        case i ~ Some(d) => DecimalLit(i.mkString + "." + d.mkString)
-      }
-    | '\'' ~> chrExcept('\'', '\n', EofCh).* <~ '\'' ^^
-      { case chars => StringLit(chars mkString "") }
-    | '"' ~> chrExcept('"', '\n', EofCh).* <~ '"' ^^
-      { case chars => StringLit(chars mkString "") }
-    | '`' ~> chrExcept('`', '\n', EofCh).* <~ '`' ^^
-      { case chars => Identifier(chars mkString "") }
-    | EofCh ^^^ EOF
-    | '\'' ~> failure("unclosed string literal")
-    | '"' ~> failure("unclosed string literal")
-    | delim
-    | failure("illegal character")
-    )
-
-  override def identChar: Parser[Elem] = letter | elem('_')
-
-  private lazy val scientificNotation: Parser[String] =
-    (elem('e') | elem('E')) ~> (elem('+') | elem('-')).? ~ rep1(digit) ^^ {
-      case s ~ rest => "e" + s.mkString + rest.mkString
-    }
-
-  override def whitespace: Parser[Any] =
-    ( whitespaceChar
-    | '/' ~ '*' ~ comment
-    | '/' ~ '/' ~ chrExcept(EofCh, '\n').*
-    | '#' ~ chrExcept(EofCh, '\n').*
-    | '-' ~ '-' ~ chrExcept(EofCh, '\n').*
-    | '/' ~ '*' ~ failure("unclosed comment")
-    ).*
-}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
index 6f50be7a99..b06aa7bc52 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -25,7 +25,7 @@ import org.json4s.JsonDSL._
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, InterpretedOrdering}
-import org.apache.spark.sql.catalyst.parser.{DataTypeParser, LegacyTypeStringParser}
+import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, LegacyTypeStringParser}
 import org.apache.spark.sql.catalyst.util.quoteIdentifier
 
 /**
@@ -169,7 +169,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
    * }}}
    */
   def add(name: String, dataType: String): StructType = {
-    add(name, DataTypeParser.parse(dataType), nullable = true, Metadata.empty)
+    add(name, CatalystSqlParser.parseDataType(dataType), nullable = true, Metadata.empty)
   }
 
   /**
@@ -184,7 +184,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
    * }}}
    */
   def add(name: String, dataType: String, nullable: Boolean): StructType = {
-    add(name, DataTypeParser.parse(dataType), nullable, Metadata.empty)
+    add(name, CatalystSqlParser.parseDataType(dataType), nullable, Metadata.empty)
   }
 
   /**
@@ -202,7 +202,7 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
       dataType: String,
       nullable: Boolean,
       metadata: Metadata): StructType = {
-    add(name, DataTypeParser.parse(dataType), nullable, metadata)
+    add(name, CatalystSqlParser.parseDataType(dataType), nullable, metadata)
   }
 
   /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala
index 07b89cb61f..40782978a7 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DataTypeParserSuite.scala
@@ -20,9 +20,9 @@ package org.apache.spark.sql.catalyst.parser
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.types._
 
-abstract class AbstractDataTypeParserSuite extends SparkFunSuite {
+class CatalystQlDataTypeParserSuite extends SparkFunSuite {
 
-  def parse(sql: String): DataType
+  def parse(sql: String): DataType = CatalystSqlParser.parseDataType(sql)
 
   def checkDataType(dataTypeString: String, expectedDataType: DataType): Unit = {
     test(s"parse ${dataTypeString.replace("\n", "")}") {
@@ -30,7 +30,8 @@ abstract class AbstractDataTypeParserSuite extends SparkFunSuite {
     }
   }
 
-  def intercept(sql: String)
+  def intercept(sql: String): Unit =
+    intercept[ParseException](CatalystSqlParser.parseDataType(sql))
 
   def unsupported(dataTypeString: String): Unit = {
     test(s"$dataTypeString is not supported") {
@@ -115,38 +116,16 @@ abstract class AbstractDataTypeParserSuite extends SparkFunSuite {
   unsupported("it is not a data type")
   unsupported("struct<x+y: int, 1.1:timestamp>")
   unsupported("struct<x: int")
-}
-
-class DataTypeParserSuite extends AbstractDataTypeParserSuite {
-  override def intercept(sql: String): Unit =
-    intercept[DataTypeException](DataTypeParser.parse(sql))
-
-  override def parse(sql: String): DataType =
-    DataTypeParser.parse(sql)
-
-  // A column name can be a reserved word in our DDL parser and SqlParser.
+  // DataType parser accepts certain reserved keywords.
   checkDataType(
-    "Struct<TABLE: string, CASE:boolean>",
+    "Struct<TABLE: string, DATE:boolean>",
     StructType(
       StructField("TABLE", StringType, true) ::
-      StructField("CASE", BooleanType, true) :: Nil)
+      StructField("DATE", BooleanType, true) :: Nil)
   )
 
-  unsupported("struct<x int, y string>")
-
-  unsupported("struct<`x``y` int>")
-}
-
-class CatalystQlDataTypeParserSuite extends AbstractDataTypeParserSuite {
-  override def intercept(sql: String): Unit =
-    intercept[ParseException](CatalystSqlParser.parseDataType(sql))
-
-  override def parse(sql: String): DataType =
-    CatalystSqlParser.parseDataType(sql)
-
-  // A column name can be a reserved word in our DDL parser and SqlParser.
-  unsupported("Struct<TABLE: string, CASE:boolean>")
-
+  // Define struct columns without ':'
   checkDataType(
     "struct<x int, y string>",
     (new StructType).add("x", IntegerType).add("y", StringType))
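The CASE-to-DATE change in the test suite above is the "more keywords" half of this patch: with a single parser, a keyword can appear as an unquoted field name only if it is on the nonReserved list. A hedged sketch of the observable behavior, read off the updated test expectations (not an exhaustive claim about which keywords are reserved):

```scala
import scala.util.Try
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser

// nonReserved keywords such as TABLE and DATE are accepted as field names:
CatalystSqlParser.parseDataType("Struct<TABLE: string, DATE:boolean>")
// CASE was swapped out of the test because it remains reserved and fails to parse:
assert(Try(CatalystSqlParser.parseDataType("struct<CASE: boolean>")).isFailure)
```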
int, 1.1:timestamp>") unsupported("struct<x: int") -} - -class DataTypeParserSuite extends AbstractDataTypeParserSuite { - override def intercept(sql: String): Unit = - intercept[DataTypeException](DataTypeParser.parse(sql)) - override def parse(sql: String): DataType = - DataTypeParser.parse(sql) - - // A column name can be a reserved word in our DDL parser and SqlParser. + // DataType parser accepts certain reserved keywords. checkDataType( - "Struct<TABLE: string, CASE:boolean>", + "Struct<TABLE: string, DATE:boolean>", StructType( StructField("TABLE", StringType, true) :: - StructField("CASE", BooleanType, true) :: Nil) + StructField("DATE", BooleanType, true) :: Nil) ) - unsupported("struct<x int, y string>") - - unsupported("struct<`x``y` int>") -} - -class CatalystQlDataTypeParserSuite extends AbstractDataTypeParserSuite { - override def intercept(sql: String): Unit = - intercept[ParseException](CatalystSqlParser.parseDataType(sql)) - - override def parse(sql: String): DataType = - CatalystSqlParser.parseDataType(sql) - - // A column name can be a reserved word in our DDL parser and SqlParser. - unsupported("Struct<TABLE: string, CASE:boolean>") - + // Define struct columns without ':' checkDataType( "struct<x int, y string>", (new StructType).add("x", IntegerType).add("y", StringType)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index bd96941da7..c58addaf90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -24,7 +24,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.parser.DataTypeParser +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.usePrettyExpression import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression import org.apache.spark.sql.functions.lit @@ -979,7 +979,7 @@ class Column(protected[sql] val expr: Expression) extends Logging { * @group expr_ops * @since 1.3.0 */ - def cast(to: String): Column = cast(DataTypeParser.parse(to)) + def cast(to: String): Column = cast(CatalystSqlParser.parseDataType(to)) /** * Returns an ordering used in sorting. 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
index 5b580d0ef9..1671228fd9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala
@@ -30,7 +30,7 @@ import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference, Expression}
-import org.apache.spark.sql.catalyst.parser.DataTypeParser
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.execution.FileRelation
 import org.apache.spark.sql.hive.client.HiveClient
@@ -188,7 +188,7 @@ private[hive] case class MetastoreRelation(
   implicit class SchemaAttribute(f: CatalogColumn) {
     def toAttribute: AttributeReference = AttributeReference(
       f.name,
-      DataTypeParser.parse(f.dataType),
+      CatalystSqlParser.parseDataType(f.dataType),
       // Since data can be dumped in randomly with no validation, everything is nullable.
       nullable = true
     )(qualifier = Some(alias.getOrElse(tableName)))
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
index da7b73ae64..13d2bed606 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala
@@ -24,7 +24,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.catalyst.parser.DataTypeParser
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.types.StructType
 
 private[orc] object OrcFileOperator extends Logging {
@@ -78,7 +78,7 @@ private[orc] object OrcFileOperator extends Logging {
       val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
       val schema = readerInspector.getTypeName
       logDebug(s"Reading schema from file $paths, got Hive schema string: $schema")
-      DataTypeParser.parse(schema).asInstanceOf[StructType]
+      CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType]
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index 3665bb48e3..b043d291aa 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -22,7 +22,7 @@ import java.io.File
 import org.apache.spark.sql.{QueryTest, Row, SaveMode}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.CatalogTableType
-import org.apache.spark.sql.catalyst.parser.DataTypeParser
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.{ExamplePointUDT, SQLTestUtils}
@@ -33,7 +33,7 @@ class HiveMetastoreCatalogSuite extends TestHiveSingleton {
   test("struct field should accept underscore in sub-column name") {
     val hiveTypeStr = "struct<a: int, b_1: string, c: string>"
-    val dateType = DataTypeParser.parse(hiveTypeStr)
+    val dateType = CatalystSqlParser.parseDataType(hiveTypeStr)
     assert(dateType.isInstanceOf[StructType])
   }
name") { val hiveTypeStr = "struct<a: int, b_1: string, c: string>" - val dateType = DataTypeParser.parse(hiveTypeStr) + val dateType = CatalystSqlParser.parseDataType(hiveTypeStr) assert(dateType.isInstanceOf[StructType]) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index d6c98ea619..c3a9f2479c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} -import org.apache.spark.sql.catalyst.parser.DataTypeParser +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -922,7 +922,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv // As a proxy for verifying that the table was stored in Hive compatible format, // we verify that each column of the table is of native type StringType. assert(sharedState.externalCatalog.getTable("default", "not_skip_hive_metadata").schema - .forall(column => DataTypeParser.parse(column.dataType) == StringType)) + .forall(column => CatalystSqlParser.parseDataType(column.dataType) == StringType)) CreateDataSourceTableUtils.createDataSourceTable( sparkSession = sqlContext.sparkSession, @@ -937,7 +937,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv // As a proxy for verifying that the table was stored in SparkSQL format, // we verify that the table has a column type as array of StringType. assert(sharedState.externalCatalog.getTable("default", "skip_hive_metadata") - .schema.forall { c => DataTypeParser.parse(c.dataType) == ArrayType(StringType) }) + .schema.forall { c => + CatalystSqlParser.parseDataType(c.dataType) == ArrayType(StringType) }) } } |