author	Yin Huai <yhuai@databricks.com>	2015-03-21 13:27:53 -0700
committer	Michael Armbrust <michael@databricks.com>	2015-03-21 13:27:53 -0700
commit	94a102acb80a7c77f57409ece1f8dbbba791b774 (patch)
tree	334a3c7028af526bfd1954e05eb4e1148cfdb8ab /sql
parent	ee569a0c7171d149eee52877def902378eaf695e (diff)
[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.
This PR creates a trait `DataTypeParser` used to parse data types. This trait aims to be the single place providing the functionality of parsing the string representation of data types. It is currently mixed in with `DDLParser` and `SqlParser`. It is also used to parse the data type for `DataFrame.cast` and to convert Hive metastore's data type strings back to a `DataType`.

JIRA: https://issues.apache.org/jira/browse/SPARK-6250

Author: Yin Huai <yhuai@databricks.com>

Closes #5078 from yhuai/ddlKeywords and squashes the following commits:

0e66097 [Yin Huai] Special handle struct<>.
fea6012 [Yin Huai] Style.
c9733fb [Yin Huai] Create a trait to parse data types.
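For context, a minimal usage sketch of the new parser, mirroring the cases in the added DataTypeParserSuite. `DataTypeParser` is `private[sql]`, so code like this only compiles inside Spark's sql packages, and the wrapper object name is purely illustrative:

    package org.apache.spark.sql.types

    object DataTypeParserExample {
      // Type keywords are matched case-insensitively via regexes, so they no
      // longer collide with the SQL parser's reserved words.
      val intType: DataType = DataTypeParser("int")               // IntegerType
      val decType: DataType = DataTypeParser("decimal(10, 5)")    // DecimalType(10, 5)
      // Field names containing special characters must be quoted with backticks.
      val struct: DataType = DataTypeParser("struct<`x+y`:int, ts:timestamp>")
      // Strings that cannot be parsed throw a DataTypeException.
    }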
Diffstat (limited to 'sql')
-rw-r--r--	sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala	27
-rw-r--r--	sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala	115
-rw-r--r--	sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala	116
-rw-r--r--	sql/core/src/main/scala/org/apache/spark/sql/Column.scala	15
-rw-r--r--	sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala	80
-rw-r--r--	sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala	8
6 files changed, 241 insertions, 120 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index 54ab13ca35..ea7d44a372 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -35,7 +35,7 @@ import org.apache.spark.sql.types._
* This is currently included mostly for illustrative purposes. Users wanting more complete support
* for a SQL like language should checkout the HiveQL support in the sql/hive sub-project.
*/
-class SqlParser extends AbstractSparkSQLParser {
+class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
def parseExpression(input: String): Expression = {
// Initialize the Keywords.
@@ -61,11 +61,8 @@ class SqlParser extends AbstractSparkSQLParser {
protected val CAST = Keyword("CAST")
protected val COALESCE = Keyword("COALESCE")
protected val COUNT = Keyword("COUNT")
- protected val DATE = Keyword("DATE")
- protected val DECIMAL = Keyword("DECIMAL")
protected val DESC = Keyword("DESC")
protected val DISTINCT = Keyword("DISTINCT")
- protected val DOUBLE = Keyword("DOUBLE")
protected val ELSE = Keyword("ELSE")
protected val END = Keyword("END")
protected val EXCEPT = Keyword("EXCEPT")
@@ -78,7 +75,6 @@ class SqlParser extends AbstractSparkSQLParser {
protected val IF = Keyword("IF")
protected val IN = Keyword("IN")
protected val INNER = Keyword("INNER")
- protected val INT = Keyword("INT")
protected val INSERT = Keyword("INSERT")
protected val INTERSECT = Keyword("INTERSECT")
protected val INTO = Keyword("INTO")
@@ -105,13 +101,11 @@ class SqlParser extends AbstractSparkSQLParser {
protected val SELECT = Keyword("SELECT")
protected val SEMI = Keyword("SEMI")
protected val SQRT = Keyword("SQRT")
- protected val STRING = Keyword("STRING")
protected val SUBSTR = Keyword("SUBSTR")
protected val SUBSTRING = Keyword("SUBSTRING")
protected val SUM = Keyword("SUM")
protected val TABLE = Keyword("TABLE")
protected val THEN = Keyword("THEN")
- protected val TIMESTAMP = Keyword("TIMESTAMP")
protected val TRUE = Keyword("TRUE")
protected val UNION = Keyword("UNION")
protected val UPPER = Keyword("UPPER")
@@ -315,7 +309,9 @@ class SqlParser extends AbstractSparkSQLParser {
)
protected lazy val cast: Parser[Expression] =
- CAST ~ "(" ~> expression ~ (AS ~> dataType) <~ ")" ^^ { case exp ~ t => Cast(exp, t) }
+ CAST ~ "(" ~> expression ~ (AS ~> dataType) <~ ")" ^^ {
+ case exp ~ t => Cast(exp, t)
+ }
protected lazy val literal: Parser[Literal] =
( numericLiteral
@@ -387,19 +383,4 @@ class SqlParser extends AbstractSparkSQLParser {
(ident <~ ".") ~ ident ~ rep("." ~> ident) ^^ {
case i1 ~ i2 ~ rest => UnresolvedAttribute((Seq(i1, i2) ++ rest).mkString("."))
}
-
- protected lazy val dataType: Parser[DataType] =
- ( STRING ^^^ StringType
- | TIMESTAMP ^^^ TimestampType
- | DOUBLE ^^^ DoubleType
- | fixedDecimalType
- | DECIMAL ^^^ DecimalType.Unlimited
- | DATE ^^^ DateType
- | INT ^^^ IntegerType
- )
-
- protected lazy val fixedDecimalType: Parser[DataType] =
- (DECIMAL ~ "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
- case precision ~ scale => DecimalType(precision.toInt, scale.toInt)
- }
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
new file mode 100644
index 0000000000..89278f7dbc
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataTypeParser.scala
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.types
+
+import scala.language.implicitConversions
+import scala.util.matching.Regex
+import scala.util.parsing.combinator.syntactical.StandardTokenParsers
+
+import org.apache.spark.sql.catalyst.SqlLexical
+
+/**
+ * This is a data type parser that can be used to parse string representations of data types
+ * provided in SQL queries. This parser is mixed in with DDLParser and SqlParser.
+ */
+private[sql] trait DataTypeParser extends StandardTokenParsers {
+
+ // This is used to create a parser from a regex. We are using regexes for data type strings
+ // since these strings can be also used as column names or field names.
+ import lexical.Identifier
+ implicit def regexToParser(regex: Regex): Parser[String] = acceptMatch(
+ s"identifier matching regex ${regex}",
+ { case Identifier(str) if regex.unapplySeq(str).isDefined => str }
+ )
+
+ protected lazy val primitiveType: Parser[DataType] =
+ "(?i)string".r ^^^ StringType |
+ "(?i)float".r ^^^ FloatType |
+ "(?i)int".r ^^^ IntegerType |
+ "(?i)tinyint".r ^^^ ByteType |
+ "(?i)smallint".r ^^^ ShortType |
+ "(?i)double".r ^^^ DoubleType |
+ "(?i)bigint".r ^^^ LongType |
+ "(?i)binary".r ^^^ BinaryType |
+ "(?i)boolean".r ^^^ BooleanType |
+ fixedDecimalType |
+ "(?i)decimal".r ^^^ DecimalType.Unlimited |
+ "(?i)date".r ^^^ DateType |
+ "(?i)timestamp".r ^^^ TimestampType |
+ varchar
+
+ protected lazy val fixedDecimalType: Parser[DataType] =
+ ("(?i)decimal".r ~> "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
+ case precision ~ scale =>
+ DecimalType(precision.toInt, scale.toInt)
+ }
+
+ protected lazy val varchar: Parser[DataType] =
+ "(?i)varchar".r ~> "(" ~> (numericLit <~ ")") ^^^ StringType
+
+ protected lazy val arrayType: Parser[DataType] =
+ "(?i)array".r ~> "<" ~> dataType <~ ">" ^^ {
+ case tpe => ArrayType(tpe)
+ }
+
+ protected lazy val mapType: Parser[DataType] =
+ "(?i)map".r ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
+ case t1 ~ _ ~ t2 => MapType(t1, t2)
+ }
+
+ protected lazy val structField: Parser[StructField] =
+ ident ~ ":" ~ dataType ^^ {
+ case name ~ _ ~ tpe => StructField(name, tpe, nullable = true)
+ }
+
+ protected lazy val structType: Parser[DataType] =
+ ("(?i)struct".r ~> "<" ~> repsep(structField, ",") <~ ">" ^^ {
+ case fields => new StructType(fields.toArray)
+ }) |
+ ("(?i)struct".r ~ "<>" ^^^ StructType(Nil))
+
+ protected lazy val dataType: Parser[DataType] =
+ arrayType |
+ mapType |
+ structType |
+ primitiveType
+
+ def toDataType(dataTypeString: String): DataType = synchronized {
+ phrase(dataType)(new lexical.Scanner(dataTypeString)) match {
+ case Success(result, _) => result
+ case failure: NoSuccess => throw new DataTypeException(failMessage(dataTypeString))
+ }
+ }
+
+ private def failMessage(dataTypeString: String): String = {
+ s"Unsupported dataType: $dataTypeString. If you have a struct and a field name of it has " +
+ "any special characters, please use backticks (`) to quote that field name, e.g. `x+y`. " +
+ "Please note that backtick itself is not supported in a field name."
+ }
+}
+
+private[sql] object DataTypeParser {
+ lazy val dataTypeParser = new DataTypeParser {
+ override val lexical = new SqlLexical
+ }
+
+ def apply(dataTypeString: String): DataType = dataTypeParser.toDataType(dataTypeString)
+}
+
+/** The exception thrown from the [[DataTypeParser]]. */
+protected[sql] class DataTypeException(message: String) extends Exception(message)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala
new file mode 100644
index 0000000000..1ba21b6460
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/types/DataTypeParserSuite.scala
@@ -0,0 +1,116 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql.types
+
+import org.scalatest.FunSuite
+
+class DataTypeParserSuite extends FunSuite {
+
+ def checkDataType(dataTypeString: String, expectedDataType: DataType): Unit = {
+ test(s"parse ${dataTypeString.replace("\n", "")}") {
+ assert(DataTypeParser(dataTypeString) === expectedDataType)
+ }
+ }
+
+ def unsupported(dataTypeString: String): Unit = {
+ test(s"$dataTypeString is not supported") {
+ intercept[DataTypeException](DataTypeParser(dataTypeString))
+ }
+ }
+
+ checkDataType("int", IntegerType)
+ checkDataType("BooLean", BooleanType)
+ checkDataType("tinYint", ByteType)
+ checkDataType("smallINT", ShortType)
+ checkDataType("INT", IntegerType)
+ checkDataType("bigint", LongType)
+ checkDataType("float", FloatType)
+ checkDataType("dOUBle", DoubleType)
+ checkDataType("decimal(10, 5)", DecimalType(10, 5))
+ checkDataType("decimal", DecimalType.Unlimited)
+ checkDataType("DATE", DateType)
+ checkDataType("timestamp", TimestampType)
+ checkDataType("string", StringType)
+ checkDataType("varchAr(20)", StringType)
+ checkDataType("BINARY", BinaryType)
+
+ checkDataType("array<doublE>", ArrayType(DoubleType, true))
+ checkDataType("Array<map<int, tinYint>>", ArrayType(MapType(IntegerType, ByteType, true), true))
+ checkDataType(
+ "array<struct<tinYint:tinyint>>",
+ ArrayType(StructType(StructField("tinYint", ByteType, true) :: Nil), true)
+ )
+ checkDataType("MAP<int, STRING>", MapType(IntegerType, StringType, true))
+ checkDataType("MAp<int, ARRAY<double>>", MapType(IntegerType, ArrayType(DoubleType), true))
+ checkDataType(
+ "MAP<int, struct<varchar:string>>",
+ MapType(IntegerType, StructType(StructField("varchar", StringType, true) :: Nil), true)
+ )
+
+ checkDataType(
+ "struct<intType: int, ts:timestamp>",
+ StructType(
+ StructField("intType", IntegerType, true) ::
+ StructField("ts", TimestampType, true) :: Nil)
+ )
+ // It is fine to use the data type string as the column name.
+ checkDataType(
+ "Struct<int: int, timestamp:timestamp>",
+ StructType(
+ StructField("int", IntegerType, true) ::
+ StructField("timestamp", TimestampType, true) :: Nil)
+ )
+ checkDataType(
+ """
+ |struct<
+ | struct:struct<deciMal:DECimal, anotherDecimal:decimAL(5,2)>,
+ | MAP:Map<timestamp, varchar(10)>,
+ | arrAy:Array<double>>
+ """.stripMargin,
+ StructType(
+ StructField("struct",
+ StructType(
+ StructField("deciMal", DecimalType.Unlimited, true) ::
+ StructField("anotherDecimal", DecimalType(5, 2), true) :: Nil), true) ::
+ StructField("MAP", MapType(TimestampType, StringType), true) ::
+ StructField("arrAy", ArrayType(DoubleType, true), true) :: Nil)
+ )
+ // A column name can be a reserved word in our DDL parser and SqlParser.
+ checkDataType(
+ "Struct<TABLE: string, CASE:boolean>",
+ StructType(
+ StructField("TABLE", StringType, true) ::
+ StructField("CASE", BooleanType, true) :: Nil)
+ )
+ // Use backticks to quote column names having special characters.
+ checkDataType(
+ "struct<`x+y`:int, `!@#$%^&*()`:string, `1_2.345<>:\"`:varchar(20)>",
+ StructType(
+ StructField("x+y", IntegerType, true) ::
+ StructField("!@#$%^&*()", StringType, true) ::
+ StructField("1_2.345<>:\"", StringType, true) :: Nil)
+ )
+ // Empty struct.
+ checkDataType("strUCt<>", StructType(Nil))
+
+ unsupported("it is not a data type")
+ unsupported("struct<x+y: int, 1.1:timestamp>")
+ unsupported("struct<x: int")
+ unsupported("struct<x int, y string>")
+ unsupported("struct<`x``y` int>")
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
index b7a13a1b26..ec7d15f5bc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -624,20 +624,7 @@ class Column(protected[sql] val expr: Expression) {
*
* @group expr_ops
*/
- def cast(to: String): Column = cast(to.toLowerCase match {
- case "string" | "str" => StringType
- case "boolean" => BooleanType
- case "byte" => ByteType
- case "short" => ShortType
- case "int" => IntegerType
- case "long" => LongType
- case "float" => FloatType
- case "double" => DoubleType
- case "decimal" => DecimalType.Unlimited
- case "date" => DateType
- case "timestamp" => TimestampType
- case _ => throw new RuntimeException(s"""Unsupported cast type: "$to"""")
- })
+ def cast(to: String): Column = cast(DataTypeParser(to))
/**
* Returns an ordering used in sorting.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
index d57406645e..d2e807d3a6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala
@@ -34,7 +34,8 @@ import org.apache.spark.util.Utils
* A parser for foreign DDL commands.
*/
private[sql] class DDLParser(
- parseQuery: String => LogicalPlan) extends AbstractSparkSQLParser with Logging {
+ parseQuery: String => LogicalPlan)
+ extends AbstractSparkSQLParser with DataTypeParser with Logging {
def apply(input: String, exceptionOnError: Boolean): Option[LogicalPlan] = {
try {
@@ -46,14 +47,6 @@ private[sql] class DDLParser(
}
}
- def parseType(input: String): DataType = {
- lexical.initialize(reservedWords)
- phrase(dataType)(new lexical.Scanner(input)) match {
- case Success(r, x) => r
- case x => throw new DDLException(s"Unsupported dataType: $x")
- }
- }
-
// Keyword is a convention with AbstractSparkSQLParser, which will scan all of the `Keyword`
// properties via reflection the class in runtime for constructing the SqlLexical object
protected val CREATE = Keyword("CREATE")
@@ -70,24 +63,6 @@ private[sql] class DDLParser(
protected val COMMENT = Keyword("COMMENT")
protected val REFRESH = Keyword("REFRESH")
- // Data types.
- protected val STRING = Keyword("STRING")
- protected val BINARY = Keyword("BINARY")
- protected val BOOLEAN = Keyword("BOOLEAN")
- protected val TINYINT = Keyword("TINYINT")
- protected val SMALLINT = Keyword("SMALLINT")
- protected val INT = Keyword("INT")
- protected val BIGINT = Keyword("BIGINT")
- protected val FLOAT = Keyword("FLOAT")
- protected val DOUBLE = Keyword("DOUBLE")
- protected val DECIMAL = Keyword("DECIMAL")
- protected val DATE = Keyword("DATE")
- protected val TIMESTAMP = Keyword("TIMESTAMP")
- protected val VARCHAR = Keyword("VARCHAR")
- protected val ARRAY = Keyword("ARRAY")
- protected val MAP = Keyword("MAP")
- protected val STRUCT = Keyword("STRUCT")
-
protected lazy val ddl: Parser[LogicalPlan] = createTable | describeTable | refreshTable
protected def start: Parser[LogicalPlan] = ddl
@@ -189,58 +164,9 @@ private[sql] class DDLParser(
new MetadataBuilder().putString(COMMENT.str.toLowerCase, comment).build()
case None => Metadata.empty
}
- StructField(columnName, typ, nullable = true, meta)
- }
-
- protected lazy val primitiveType: Parser[DataType] =
- STRING ^^^ StringType |
- BINARY ^^^ BinaryType |
- BOOLEAN ^^^ BooleanType |
- TINYINT ^^^ ByteType |
- SMALLINT ^^^ ShortType |
- INT ^^^ IntegerType |
- BIGINT ^^^ LongType |
- FLOAT ^^^ FloatType |
- DOUBLE ^^^ DoubleType |
- fixedDecimalType | // decimal with precision/scale
- DECIMAL ^^^ DecimalType.Unlimited | // decimal with no precision/scale
- DATE ^^^ DateType |
- TIMESTAMP ^^^ TimestampType |
- VARCHAR ~ "(" ~ numericLit ~ ")" ^^^ StringType
-
- protected lazy val fixedDecimalType: Parser[DataType] =
- (DECIMAL ~ "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
- case precision ~ scale => DecimalType(precision.toInt, scale.toInt)
- }
-
- protected lazy val arrayType: Parser[DataType] =
- ARRAY ~> "<" ~> dataType <~ ">" ^^ {
- case tpe => ArrayType(tpe)
- }
- protected lazy val mapType: Parser[DataType] =
- MAP ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
- case t1 ~ _ ~ t2 => MapType(t1, t2)
- }
-
- protected lazy val structField: Parser[StructField] =
- ident ~ ":" ~ dataType ^^ {
- case fieldName ~ _ ~ tpe => StructField(fieldName, tpe, nullable = true)
+ StructField(columnName, typ, nullable = true, meta)
}
-
- protected lazy val structType: Parser[DataType] =
- (STRUCT ~> "<" ~> repsep(structField, ",") <~ ">" ^^ {
- case fields => StructType(fields)
- }) |
- (STRUCT ~> "<>" ^^ {
- case fields => StructType(Nil)
- })
-
- private[sql] lazy val dataType: Parser[DataType] =
- arrayType |
- mapType |
- structType |
- primitiveType
}
private[sql] object ResolvedDataSource {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index fe86bd206a..949a4e54e6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -756,7 +756,7 @@ private[hive] case class MetastoreRelation
implicit class SchemaAttribute(f: FieldSchema) {
def toAttribute = AttributeReference(
f.getName,
- sqlContext.ddlParser.parseType(f.getType),
+ HiveMetastoreTypes.toDataType(f.getType),
// Since data can be dumped in randomly with no validation, everything is nullable.
nullable = true
)(qualifiers = Seq(alias.getOrElse(tableName)))
@@ -779,11 +779,7 @@ private[hive] case class MetastoreRelation
private[hive] object HiveMetastoreTypes {
- protected val ddlParser = new DDLParser(HiveQl.parseSql(_))
-
- def toDataType(metastoreType: String): DataType = synchronized {
- ddlParser.parseType(metastoreType)
- }
+ def toDataType(metastoreType: String): DataType = DataTypeParser(metastoreType)
def toMetastoreType(dt: DataType): String = dt match {
case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>"