author     scwf <wangfei1@huawei.com>    2015-01-10 13:53:21 -0800
committer  Michael Armbrust <michael@databricks.com>    2015-01-10 13:53:21 -0800
commit     693a323a70aba91e6c100dd5561d218a75b7895e (patch)
tree       3604ec22163d5296496d1d1907e6b3edbafc0108 /sql/hive
parent     4b39fd1e63188821fc84a13f7ccb6e94277f4be7 (diff)
[SPARK-4574][SQL] Adding support for defining schema in foreign DDL commands.
Adding support for defining a schema in foreign DDL commands. Foreign DDL already supports commands like:

```
CREATE TEMPORARY TABLE avroTable
USING org.apache.spark.sql.avro
OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")
```

With this PR the user can define a schema instead of having it inferred from the file, so DDL commands like the following are also supported:

```
CREATE TEMPORARY TABLE avroTable(a int, b string)
USING org.apache.spark.sql.avro
OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")
```

Author: scwf <wangfei1@huawei.com>
Author: Yin Huai <yhuai@databricks.com>
Author: Fei Wang <wangfei1@huawei.com>
Author: wangfei <wangfei1@huawei.com>

Closes #3431 from scwf/ddl and squashes the following commits:

7e79ce5 [Fei Wang] Merge pull request #22 from yhuai/pr3431yin
38f634e [Yin Huai] Remove Option from createRelation.
65e9c73 [Yin Huai] Revert all changes since applying a given schema has not been tested.
a852b10 [scwf] remove cleanIdentifier
f336a16 [Fei Wang] Merge pull request #21 from yhuai/pr3431yin
baf79b5 [Yin Huai] Test special characters quoted by backticks.
50a03b0 [Yin Huai] Use JsonRDD.nullTypeToStringType to convert NullType to StringType.
1eeb769 [Fei Wang] Merge pull request #20 from yhuai/pr3431yin
f5c22b0 [Yin Huai] Refactor code and update test cases.
f1cffe4 [Yin Huai] Revert "minor refactory"
b621c8f [scwf] minor refactory
d02547f [scwf] fix HiveCompatibilitySuite test failure
8dfbf7a [scwf] more tests for complex data type
ddab984 [Fei Wang] Merge pull request #19 from yhuai/pr3431yin
91ad91b [Yin Huai] Parse data types in DDLParser.
cf982d2 [scwf] fixed test failure
445b57b [scwf] address comments
02a662c [scwf] style issue
44eb70c [scwf] fix decimal parser issue
83b6fc3 [scwf] minor fix
9bf12f8 [wangfei] adding test case
7787ec7 [wangfei] added SchemaRelationProvider
0ba70df [wangfei] draft version
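For context, here is a minimal sketch of the schema-aware data source hook this change builds on (the SchemaRelationProvider mentioned in the squashed commits above). The trait and `createRelation` signature shown here are assumptions for illustration only; they are not part of this diff.

```scala
// Hedged sketch, not part of this patch: a data source that accepts the
// user-supplied schema from a statement such as
//   CREATE TEMPORARY TABLE t(a int, b string) USING ... OPTIONS (path "...")
// Trait name and method signature are assumptions based on the commit log.
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.types.StructType
import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider}

class DefaultSource extends SchemaRelationProvider {
  override def createRelation(
      sqlCtx: SQLContext,
      parameters: Map[String, String],
      userSchema: StructType): BaseRelation = {
    // `path` comes from the OPTIONS clause of the DDL statement.
    val path = parameters("path")
    // A trivial relation that only carries the user-defined schema; a real
    // source (such as the Avro source in the example above) would also read `path`.
    new BaseRelation {
      override def sqlContext: SQLContext = sqlCtx
      override def schema: StructType = userSchema
    }
  }
}
```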
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala       | 114
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala  |   5
2 files changed, 29 insertions, 90 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 2c859894cf..c25288e000 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -20,12 +20,7 @@ package org.apache.spark.sql.hive
import java.io.IOException
import java.util.{List => JList}
-import org.apache.spark.sql.execution.SparkPlan
-
-import scala.util.parsing.combinator.RegexParsers
-
import org.apache.hadoop.util.ReflectionUtils
-
import org.apache.hadoop.hive.metastore.TableType
import org.apache.hadoop.hive.metastore.api.FieldSchema
import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition}
@@ -37,7 +32,6 @@ import org.apache.hadoop.hive.serde2.{Deserializer, SerDeException}
import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
import org.apache.spark.Logging
-import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.analysis.{Catalog, OverrideCatalog}
import org.apache.spark.sql.catalyst.expressions._
@@ -412,88 +406,6 @@ private[hive] case class InsertIntoHiveTable(
}
}
-/**
- * :: DeveloperApi ::
- * Provides conversions between Spark SQL data types and Hive Metastore types.
- */
-@DeveloperApi
-object HiveMetastoreTypes extends RegexParsers {
- protected lazy val primitiveType: Parser[DataType] =
- "string" ^^^ StringType |
- "float" ^^^ FloatType |
- "int" ^^^ IntegerType |
- "tinyint" ^^^ ByteType |
- "smallint" ^^^ ShortType |
- "double" ^^^ DoubleType |
- "bigint" ^^^ LongType |
- "binary" ^^^ BinaryType |
- "boolean" ^^^ BooleanType |
- fixedDecimalType | // Hive 0.13+ decimal with precision/scale
- "decimal" ^^^ DecimalType.Unlimited | // Hive 0.12 decimal with no precision/scale
- "date" ^^^ DateType |
- "timestamp" ^^^ TimestampType |
- "varchar\\((\\d+)\\)".r ^^^ StringType
-
- protected lazy val fixedDecimalType: Parser[DataType] =
- ("decimal" ~> "(" ~> "\\d+".r) ~ ("," ~> "\\d+".r <~ ")") ^^ {
- case precision ~ scale =>
- DecimalType(precision.toInt, scale.toInt)
- }
-
- protected lazy val arrayType: Parser[DataType] =
- "array" ~> "<" ~> dataType <~ ">" ^^ {
- case tpe => ArrayType(tpe)
- }
-
- protected lazy val mapType: Parser[DataType] =
- "map" ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
- case t1 ~ _ ~ t2 => MapType(t1, t2)
- }
-
- protected lazy val structField: Parser[StructField] =
- "[a-zA-Z0-9_]*".r ~ ":" ~ dataType ^^ {
- case name ~ _ ~ tpe => StructField(name, tpe, nullable = true)
- }
-
- protected lazy val structType: Parser[DataType] =
- "struct" ~> "<" ~> repsep(structField,",") <~ ">" ^^ {
- case fields => new StructType(fields)
- }
-
- protected lazy val dataType: Parser[DataType] =
- arrayType |
- mapType |
- structType |
- primitiveType
-
- def toDataType(metastoreType: String): DataType = parseAll(dataType, metastoreType) match {
- case Success(result, _) => result
- case failure: NoSuccess => sys.error(s"Unsupported dataType: $metastoreType")
- }
-
- def toMetastoreType(dt: DataType): String = dt match {
- case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>"
- case StructType(fields) =>
- s"struct<${fields.map(f => s"${f.name}:${toMetastoreType(f.dataType)}").mkString(",")}>"
- case MapType(keyType, valueType, _) =>
- s"map<${toMetastoreType(keyType)},${toMetastoreType(valueType)}>"
- case StringType => "string"
- case FloatType => "float"
- case IntegerType => "int"
- case ByteType => "tinyint"
- case ShortType => "smallint"
- case DoubleType => "double"
- case LongType => "bigint"
- case BinaryType => "binary"
- case BooleanType => "boolean"
- case DateType => "date"
- case d: DecimalType => HiveShim.decimalMetastoreString(d)
- case TimestampType => "timestamp"
- case NullType => "void"
- case udt: UserDefinedType[_] => toMetastoreType(udt.sqlType)
- }
-}
-
private[hive] case class MetastoreRelation
(databaseName: String, tableName: String, alias: Option[String])
(val table: TTable, val partitions: Seq[TPartition])
@@ -551,7 +463,7 @@ private[hive] case class MetastoreRelation
implicit class SchemaAttribute(f: FieldSchema) {
def toAttribute = AttributeReference(
f.getName,
- HiveMetastoreTypes.toDataType(f.getType),
+ sqlContext.ddlParser.parseType(f.getType),
// Since data can be dumped in randomly with no validation, everything is nullable.
nullable = true
)(qualifiers = Seq(alias.getOrElse(tableName)))
@@ -571,3 +483,27 @@ private[hive] case class MetastoreRelation
/** An attribute map for determining the ordinal for non-partition columns. */
val columnOrdinals = AttributeMap(attributes.zipWithIndex)
}
+
+object HiveMetastoreTypes {
+ def toMetastoreType(dt: DataType): String = dt match {
+ case ArrayType(elementType, _) => s"array<${toMetastoreType(elementType)}>"
+ case StructType(fields) =>
+ s"struct<${fields.map(f => s"${f.name}:${toMetastoreType(f.dataType)}").mkString(",")}>"
+ case MapType(keyType, valueType, _) =>
+ s"map<${toMetastoreType(keyType)},${toMetastoreType(valueType)}>"
+ case StringType => "string"
+ case FloatType => "float"
+ case IntegerType => "int"
+ case ByteType => "tinyint"
+ case ShortType => "smallint"
+ case DoubleType => "double"
+ case LongType => "bigint"
+ case BinaryType => "binary"
+ case BooleanType => "boolean"
+ case DateType => "date"
+ case d: DecimalType => HiveShim.decimalMetastoreString(d)
+ case TimestampType => "timestamp"
+ case NullType => "void"
+ case udt: UserDefinedType[_] => toMetastoreType(udt.sqlType)
+ }
+}
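As an aside, a hedged usage sketch of the retained `toMetastoreType` helper added above, using the same `catalyst.types` constructors that appear elsewhere in this patch:

```scala
// Hedged sketch, not part of this patch: converting a Catalyst schema into
// the Hive metastore type string produced by HiveMetastoreTypes.toMetastoreType.
import org.apache.spark.sql.catalyst.types._

val schema = StructType(Seq(
  StructField("a", IntegerType, nullable = true),
  StructField("b", ArrayType(StringType), nullable = true)))

// Expected result, per the mapping above: "struct<a:int,b:array<string>>"
val metastoreString = HiveMetastoreTypes.toMetastoreType(schema)
```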
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
index 86535f8dd4..041a36f129 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.hive
import org.scalatest.FunSuite
import org.apache.spark.sql.catalyst.types.StructType
+import org.apache.spark.sql.sources.DDLParser
import org.apache.spark.sql.test.ExamplePointUDT
class HiveMetastoreCatalogSuite extends FunSuite {
@@ -27,7 +28,9 @@ class HiveMetastoreCatalogSuite extends FunSuite {
test("struct field should accept underscore in sub-column name") {
val metastr = "struct<a: int, b_1: string, c: string>"
- val datatype = HiveMetastoreTypes.toDataType(metastr)
+ val ddlParser = new DDLParser
+
+ val datatype = ddlParser.parseType(metastr)
assert(datatype.isInstanceOf[StructType])
}
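To illustrate the new parsing path exercised by the test above, here is a hedged usage sketch; it assumes only the `DDLParser.parseType` API that this diff itself introduces and calls.

```scala
// Hedged sketch, not part of this patch: the catalog and the test above now
// delegate Hive metastore type strings to the shared DDLParser instead of the
// removed HiveMetastoreTypes regex parser.
import org.apache.spark.sql.sources.DDLParser

val ddlParser = new DDLParser
// Parses a metastore type string into a Catalyst DataType,
// here a StructType with fields a: int and b_1: string.
val dataType = ddlParser.parseType("struct<a: int, b_1: string>")
```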