author | scwf <wangfei1@huawei.com> | 2015-01-10 13:53:21 -0800 |
---|---|---|
committer | Michael Armbrust <michael@databricks.com> | 2015-01-10 13:53:21 -0800 |
commit | 693a323a70aba91e6c100dd5561d218a75b7895e (patch) | |
tree | 3604ec22163d5296496d1d1907e6b3edbafc0108 /sql/hive | |
parent | 4b39fd1e63188821fc84a13f7ccb6e94277f4be7 (diff) | |
download | spark-693a323a70aba91e6c100dd5561d218a75b7895e.tar.gz spark-693a323a70aba91e6c100dd5561d218a75b7895e.tar.bz2 spark-693a323a70aba91e6c100dd5561d218a75b7895e.zip |
[SPARK-4574][SQL] Adding support for defining schema in foreign DDL commands.
Adds support for defining a schema in foreign DDL commands. Foreign DDL already supports commands like:
```
CREATE TEMPORARY TABLE avroTable
USING org.apache.spark.sql.avro
OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")
```
With this PR the user can define the schema explicitly instead of having it inferred from the file, so DDL commands like the following are supported:
```
CREATE TEMPORARY TABLE avroTable(a int, b string)
USING org.apache.spark.sql.avro
OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")
```
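On the data source side, the commits below add a `SchemaRelationProvider` hook next to the existing `RelationProvider`, so a source can receive the user-specified schema instead of always inferring one. The sketch below is illustrative only and not taken from this patch: the `AvroRelation` class and `inferSchemaFromFile` helper are hypothetical, and the exact trait signatures at this commit are assumed.
```scala
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.types.StructType
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}

// Hypothetical relation; a real source would also mix in TableScan to produce rows.
case class AvroRelation(path: String, userSchema: Option[StructType])
    (@transient val sqlContext: SQLContext) extends BaseRelation {
  // Prefer the schema written in the DDL; fall back to inferring it from the file.
  override def schema: StructType = userSchema.getOrElse(inferSchemaFromFile(path))
  private def inferSchemaFromFile(path: String): StructType = ??? // placeholder
}

class DefaultSource extends RelationProvider with SchemaRelationProvider {
  // CREATE TEMPORARY TABLE avroTable USING ... OPTIONS (...): schema is inferred.
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation =
    AvroRelation(parameters("path"), None)(sqlContext)

  // CREATE TEMPORARY TABLE avroTable(a int, b string) USING ... OPTIONS (...):
  // the parsed column list arrives here as a StructType.
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation =
    AvroRelation(parameters("path"), Some(schema))(sqlContext)
}
```
The DDL parser is responsible for turning the `(a int, b string)` column list into that `StructType` before the provider is invoked (see "Parse data types in DDLParser" below).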
Author: scwf <wangfei1@huawei.com>
Author: Yin Huai <yhuai@databricks.com>
Author: Fei Wang <wangfei1@huawei.com>
Author: wangfei <wangfei1@huawei.com>
Closes #3431 from scwf/ddl and squashes the following commits:
7e79ce5 [Fei Wang] Merge pull request #22 from yhuai/pr3431yin
38f634e [Yin Huai] Remove Option from createRelation.
65e9c73 [Yin Huai] Revert all changes since applying a given schema has not been testd.
a852b10 [scwf] remove cleanIdentifier
f336a16 [Fei Wang] Merge pull request #21 from yhuai/pr3431yin
baf79b5 [Yin Huai] Test special characters quoted by backticks.
50a03b0 [Yin Huai] Use JsonRDD.nullTypeToStringType to convert NullType to StringType.
1eeb769 [Fei Wang] Merge pull request #20 from yhuai/pr3431yin
f5c22b0 [Yin Huai] Refactor code and update test cases.
f1cffe4 [Yin Huai] Revert "minor refactory"
b621c8f [scwf] minor refactory
d02547f [scwf] fix HiveCompatibilitySuite test failure
8dfbf7a [scwf] more tests for complex data type
ddab984 [Fei Wang] Merge pull request #19 from yhuai/pr3431yin
91ad91b [Yin Huai] Parse data types in DDLParser.
cf982d2 [scwf] fixed test failure
445b57b [scwf] address comments
02a662c [scwf] style issue
44eb70c [scwf] fix decimal parser issue
83b6fc3 [scwf] minor fix
9bf12f8 [wangfei] adding test case
7787ec7 [wangfei] added SchemaRelationProvider
0ba70df [wangfei] draft version
Diffstat (limited to 'sql/hive')
-rw-r--r-- | sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala | 114
-rw-r--r-- | sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala | 5 |
2 files changed, 29 insertions, 90 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 2c859894cf..c25288e000 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -20,12 +20,7 @@ package org.apache.spark.sql.hive
 import java.io.IOException
 import java.util.{List => JList}
 
-import org.apache.spark.sql.execution.SparkPlan
-
-import scala.util.parsing.combinator.RegexParsers
-
 import org.apache.hadoop.util.ReflectionUtils
-
 import org.apache.hadoop.hive.metastore.TableType
 import org.apache.hadoop.hive.metastore.api.FieldSchema
 import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition}
@@ -37,7 +32,6 @@ import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException}
 import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.catalyst.analysis.{Catalog, OverrideCatalog}
 import org.apache.spark.sql.catalyst.expressions._
@@ -412,88 +406,6 @@ private[hive] case class InsertIntoHiveTable(
   }
 }
 
-/**
- * :: DeveloperApi ::
- * Provides conversions between Spark SQL data types and Hive Metastore types.
- */
-@DeveloperApi
-object HiveMetastoreTypes extends RegexParsers {
-  protected lazy val primitiveType: Parser[DataType] =
-    "string" ^^^ StringType |
-    "float" ^^^ FloatType |
-    "int" ^^^ IntegerType |
-    "tinyint" ^^^ ByteType |
-    "smallint" ^^^ ShortType |
-    "double" ^^^ DoubleType |
-    "bigint" ^^^ LongType |
-    "binary" ^^^ BinaryType |
-    "boolean" ^^^ BooleanType |
-    fixedDecimalType | // Hive 0.13+ decimal with precision/scale
-    "decimal" ^^^ DecimalType.Unlimited | // Hive 0.12 decimal with no precision/scale
-    "date" ^^^ DateType |
-    "timestamp" ^^^ TimestampType |
-    "varchar\\((\\d+)\\)".r ^^^ StringType
-
-  protected lazy val fixedDecimalType: Parser[DataType] =
-    ("decimal" ~> "(" ~> "\\d+".r) ~ ("," ~> "\\d+".r <~ ")") ^^ {
-      case precision ~ scale =>
-        DecimalType(precision.toInt, scale.toInt)
-    }
-
-  protected lazy val arrayType: Parser[DataType] =
-    "array" ~> "<" ~> dataType <~ ">" ^^ {
-      case tpe => ArrayType(tpe)
-    }
-
-  protected lazy val mapType: Parser[DataType] =
-    "map" ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
-      case t1 ~ _ ~ t2 => MapType(t1, t2)
-    }
-
-  protected lazy val structField: Parser[StructField] =
-    "[a-zA-Z0-9_]*".r ~ ":" ~ dataType ^^ {
-      case name ~ _ ~ tpe => StructField(name, tpe, nullable = true)
-    }
-
-  protected lazy val structType: Parser[DataType] =
-    "struct" ~> "<" ~> repsep(structField,",") <~ ">" ^^ {
-      case fields => new StructType(fields)
-    }
-
-  protected lazy val dataType: Parser[DataType] =
-    arrayType |
-    mapType |
-    structType |
-    primitiveType
-
-  def toDataType(metastoreType: String): DataType = parseAll(dataType, metastoreType) match {
-    case Success(result, _) => result
-    case failure: NoSuccess => sys.error(s"Unsupported dataType: $metastoreType")
-  }
-
-  def toMetastoreType(dt: DataType): String = dt match {
-    case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>"
-    case StructType(fields) =>
-      s"struct<${fields.map(f => s"${f.name}:${toMetastoreType(f.dataType)}").mkString(",")}>"
-    case MapType(keyType, valueType, _) =>
-      s"map<${toMetastoreType(keyType)},${toMetastoreType(valueType)}>"
-    case StringType => "string"
-    case FloatType => "float"
-    case IntegerType => "int"
-    case ByteType => "tinyint"
-    case ShortType => "smallint"
-    case DoubleType => "double"
-    case LongType => "bigint"
-    case BinaryType => "binary"
-    case BooleanType => "boolean"
-    case DateType => "date"
-    case d: DecimalType => HiveShim.decimalMetastoreString(d)
-    case TimestampType => "timestamp"
-    case NullType => "void"
-    case udt: UserDefinedType[_] => toMetastoreType(udt.sqlType)
-  }
-}
-
 private[hive] case class MetastoreRelation
     (databaseName: String, tableName: String, alias: Option[String])
     (val table: TTable, val partitions: Seq[TPartition])
@@ -551,7 +463,7 @@ private[hive] case class MetastoreRelation
   implicit class SchemaAttribute(f: FieldSchema) {
     def toAttribute = AttributeReference(
       f.getName,
-      HiveMetastoreTypes.toDataType(f.getType),
+      sqlContext.ddlParser.parseType(f.getType),
      // Since data can be dumped in randomly with no validation, everything is nullable.
      nullable = true
    )(qualifiers = Seq(alias.getOrElse(tableName)))
@@ -571,3 +483,27 @@ private[hive] case class MetastoreRelation
   /** An attribute map for determining the ordinal for non-partition columns. */
   val columnOrdinals = AttributeMap(attributes.zipWithIndex)
 }
+
+object HiveMetastoreTypes {
+  def toMetastoreType(dt: DataType): String = dt match {
+    case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>"
+    case StructType(fields) =>
+      s"struct<${fields.map(f => s"${f.name}:${toMetastoreType(f.dataType)}").mkString(",")}>"
+    case MapType(keyType, valueType, _) =>
+      s"map<${toMetastoreType(keyType)},${toMetastoreType(valueType)}>"
+    case StringType => "string"
+    case FloatType => "float"
+    case IntegerType => "int"
+    case ByteType => "tinyint"
+    case ShortType => "smallint"
+    case DoubleType => "double"
+    case LongType => "bigint"
+    case BinaryType => "binary"
+    case BooleanType => "boolean"
+    case DateType => "date"
+    case d: DecimalType => HiveShim.decimalMetastoreString(d)
+    case TimestampType => "timestamp"
+    case NullType => "void"
+    case udt: UserDefinedType[_] => toMetastoreType(udt.sqlType)
+  }
+}
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index 86535f8dd4..041a36f129 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.hive
 import org.scalatest.FunSuite
 
 import org.apache.spark.sql.catalyst.types.StructType
+import org.apache.spark.sql.sources.DDLParser
 import org.apache.spark.sql.test.ExamplePointUDT
 
 class HiveMetastoreCatalogSuite extends FunSuite {
@@ -27,7 +28,9 @@ class HiveMetastoreCatalogSuite extends FunSuite {
 
   test("struct field should accept underscore in sub-column name") {
     val metastr = "struct<a: int, b_1: string, c: string>"
-    val datatype = HiveMetastoreTypes.toDataType(metastr)
+    val ddlParser = new DDLParser
+
+    val datatype = ddlParser.parseType(metastr)
     assert(datatype.isInstanceOf[StructType])
   }
 
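After this change only the Catalyst-to-metastore direction stays in `HiveMetastoreTypes`; the metastore-to-Catalyst direction goes through the shared `DDLParser.parseType` shown above. The snippet below is a rough usage sketch, not part of the patch; it assumes the pre-1.3 package layout (`catalyst.types`) and the no-argument `DDLParser` constructor used in the updated test, and the quoted output string is illustrative.
```scala
import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.hive.HiveMetastoreTypes
import org.apache.spark.sql.sources.DDLParser

object MetastoreTypeRoundTrip {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("a", IntegerType, nullable = true),
      StructField("b_1", StringType, nullable = true),
      StructField("c", ArrayType(DoubleType), nullable = true)))

    // Catalyst -> metastore string, e.g. "struct<a:int,b_1:string,c:array<double>>"
    val metastoreType = HiveMetastoreTypes.toMetastoreType(schema)

    // Metastore string -> Catalyst type, now handled by the shared DDL parser.
    val parsed = new DDLParser().parseType(metastoreType)
    println(s"$metastoreType parsed back as $parsed")
  }
}
```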