author     Sameer Agarwal <sameer@databricks.com>    2016-01-26 07:50:37 -0800
committer  Yin Huai <yhuai@databricks.com>           2016-01-26 07:50:37 -0800
commit     08c781ca672820be9ba32838bbe40d2643c4bde4 (patch)
tree       7d77b7fb5a18967125ddaf4736fb98ecd83ec88f /sql/hive
parent     ae0309a8812a4fade3a0ea67d8986ca870aeb9eb (diff)
[SPARK-12682][SQL] Add support for (optionally) not storing tables in Hive metadata format
This PR adds a new table option (`skipHiveMetadata`) that allows the user to skip storing the table metadata in the Hive metadata format. While this could be useful in general, the specific use case for this change is that Hive doesn't handle wide schemas well (see https://issues.apache.org/jira/browse/SPARK-12682 and https://issues.apache.org/jira/browse/SPARK-6024), which in turn prevents such tables from being queried in Spark SQL.

Author: Sameer Agarwal <sameer@databricks.com>

Closes #10826 from sameeragarwal/skip-hive-metadata.
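Since the option is read from the data source options map, it can be supplied wherever those options are set. A minimal usage sketch (illustrative only, not part of this patch; the table name and data are made up, and this assumes writer options are forwarded into the table options map, as they are for "path"):

    // Hypothetical example: persist a wide table in the Spark SQL specific
    // format, bypassing the Hive-compatible metadata path.
    sqlContext.range(10).write
      .format("parquet")
      .option("skipHiveMetadata", "true")
      .saveAsTable("wide_table")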
Diffstat (limited to 'sql/hive')
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala      7
-rw-r--r--  sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala  32
2 files changed, 39 insertions, 0 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 0cfe03ba91..80e45d5162 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -327,7 +327,14 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
     // TODO: Support persisting partitioned data source relations in Hive compatible format
     val qualifiedTableName = tableIdent.quotedString
+    val skipHiveMetadata = options.getOrElse("skipHiveMetadata", "false").toBoolean
     val (hiveCompatibleTable, logMessage) = (maybeSerDe, dataSource.relation) match {
+      case _ if skipHiveMetadata =>
+        val message =
+          s"Persisting data source relation $qualifiedTableName into " +
+            "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive."
+        (None, message)
+
       case (Some(serde), relation: HadoopFsRelation)
           if relation.paths.length == 1 && relation.partitionColumns.isEmpty =>
         val hiveTable = newHiveCompatibleMetastoreTable(relation, serde)
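A note on the parsing above: Scala's String.toBoolean accepts only "true" or "false" (case-insensitive) and throws otherwise, so a mistyped option value fails fast rather than being silently ignored. An illustrative sketch (the options map here is made up):

    // How the new flag is derived from the data source options map.
    val options = Map("path" -> "/tmp/t", "skipHiveMetadata" -> "true")
    val skipHiveMetadata = options.getOrElse("skipHiveMetadata", "false").toBoolean  // true
    // A value such as "yes" would make toBoolean throw
    // java.lang.IllegalArgumentException("For input string: \"yes\"").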
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index 211932fea0..d9e4b020fd 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -900,4 +900,36 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
sqlContext.sql("""use default""")
sqlContext.sql("""drop database if exists testdb8156 CASCADE""")
}
+
+ test("skip hive metadata on table creation") {
+ val schema = StructType((1 to 5).map(i => StructField(s"c_$i", StringType)))
+
+ catalog.createDataSourceTable(
+ tableIdent = TableIdentifier("not_skip_hive_metadata"),
+ userSpecifiedSchema = Some(schema),
+ partitionColumns = Array.empty[String],
+ bucketSpec = None,
+ provider = "parquet",
+ options = Map("path" -> "just a dummy path", "skipHiveMetadata" -> "false"),
+ isExternal = false)
+
+ // As a proxy for verifying that the table was stored in Hive compatible format, we verify that
+ // each column of the table is of native type StringType.
+ assert(catalog.client.getTable("default", "not_skip_hive_metadata").schema
+ .forall(column => HiveMetastoreTypes.toDataType(column.hiveType) == StringType))
+
+ catalog.createDataSourceTable(
+ tableIdent = TableIdentifier("skip_hive_metadata"),
+ userSpecifiedSchema = Some(schema),
+ partitionColumns = Array.empty[String],
+ bucketSpec = None,
+ provider = "parquet",
+ options = Map("path" -> "just a dummy path", "skipHiveMetadata" -> "true"),
+ isExternal = false)
+
+ // As a proxy for verifying that the table was stored in SparkSQL format, we verify that
+ // the table has a column type as array of StringType.
+ assert(catalog.client.getTable("default", "skip_hive_metadata").schema
+ .forall(column => HiveMetastoreTypes.toDataType(column.hiveType) == ArrayType(StringType)))
+ }
}
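For reference, the assertions above hinge on HiveMetastoreTypes.toDataType, which parses a Hive type string into a Catalyst type; the Spark SQL specific format records a placeholder array<string> column in the metastore (with the real schema kept in table properties), which is what the second test detects. A standalone sketch of the parsing (illustrative; assumes it runs inside the org.apache.spark.sql.hive package, since the helper may be package-private):

    import org.apache.spark.sql.hive.HiveMetastoreTypes
    import org.apache.spark.sql.types.{ArrayType, StringType}

    // toDataType maps Hive type strings to Catalyst types.
    assert(HiveMetastoreTypes.toDataType("string") == StringType)
    // ArrayType(StringType) defaults to containsNull = true, matching Hive's array<string>.
    assert(HiveMetastoreTypes.toDataType("array<string>") == ArrayType(StringType))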