author     hyukjinkwon <gurwls223@gmail.com>   2015-11-16 21:59:33 +0800
committer  Cheng Lian <lian@databricks.com>    2015-11-16 21:59:33 +0800
commit     e388b39d10fc269cdd3d630ea7d4ae80fd0efa97 (patch)
tree       3f6222bb834833bfd20c4b95df4459e9153f1ca2 /sql
parent     7f8eb3bf6ed64eefc5472f5c5fb02e2db1e3f618 (diff)
[SPARK-11692][SQL] Support for Parquet logical types, JSON and BSON (embedded types)
Parquet supports JSON and BSON as embedded datatypes; internally, BSON is represented as binary and JSON as a UTF-8 string. I searched a bit and found that Apache Drill also supports both types this way ([link](https://drill.apache.org/docs/parquet-format/)).

Author: hyukjinkwon <gurwls223@gmail.com>
Author: Hyukjin Kwon <gurwls223@gmail.com>

Closes #9658 from HyukjinKwon/SPARK-11692.
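For context, here is how the new mapping surfaces to users. A minimal sketch, not part of the patch: it assumes a spark-shell style `sqlContext` and a hypothetical file `/tmp/embedded.parquet` whose schema declares a JSON-annotated column `a` and a BSON-annotated column `b`.

```scala
import org.apache.spark.sql.types.{BinaryType, StringType}

// Hypothetical Parquet file whose schema declares:
//   required binary a (JSON);
//   required binary b (BSON);
val df = sqlContext.read.parquet("/tmp/embedded.parquet")

// After this patch, the JSON-annotated column arrives as a UTF-8 string
// and the BSON-annotated column as raw bytes.
assert(df.schema("a").dataType == StringType)
assert(df.schema("b").dataType == BinaryType)
```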
Diffstat (limited to 'sql')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala    3
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala            25
2 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
index f28a18e275..5f9f908309 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
@@ -170,9 +170,10 @@ private[parquet] class CatalystSchemaConverter(
       case BINARY =>
         originalType match {
-          case UTF8 | ENUM => StringType
+          case UTF8 | ENUM | JSON => StringType
           case null if assumeBinaryIsString => StringType
           case null => BinaryType
+          case BSON => BinaryType
           case DECIMAL => makeDecimalType()
           case _ => illegalType()
         }
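To make the new rule easier to read outside the diff, here is the BINARY branch pulled out as a standalone function. A simplified sketch only: `binaryToCatalyst` is a hypothetical name, `assumeBinaryIsString` mirrors the converter's constructor flag, and the DECIMAL and illegal-type cases are stubbed out.

```scala
import org.apache.parquet.schema.OriginalType
import org.apache.parquet.schema.OriginalType.{BSON, ENUM, JSON, UTF8}
import org.apache.spark.sql.types.{BinaryType, DataType, StringType}

// Simplified mapping for Parquet BINARY columns after this patch.
def binaryToCatalyst(originalType: OriginalType, assumeBinaryIsString: Boolean): DataType =
  originalType match {
    case UTF8 | ENUM | JSON => StringType            // JSON is UTF-8 text on disk
    case null if assumeBinaryIsString => StringType  // un-annotated binary, legacy files
    case null => BinaryType
    case BSON => BinaryType                          // BSON stays as raw bytes
    case other => sys.error(s"Unsupported logical type: $other") // DECIMAL etc. omitted here
  }
```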
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
index 2aa5dca847..a148facd05 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
@@ -259,6 +259,31 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
     }
   }
 
+  test("SPARK-11692 Support for Parquet logical types, JSON and BSON (embedded types)") {
+    val parquetSchema = MessageTypeParser.parseMessageType(
+      """message root {
+        |  required binary a(JSON);
+        |  required binary b(BSON);
+        |}
+      """.stripMargin)
+
+    withTempPath { location =>
+      val extraMetadata = Map.empty[String, String].asJava
+      val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, "Spark")
+      val path = new Path(location.getCanonicalPath)
+      val footer = List(
+        new Footer(path, new ParquetMetadata(fileMetadata, Collections.emptyList()))
+      ).asJava
+
+      ParquetFileWriter.writeMetadataFile(sparkContext.hadoopConfiguration, path, footer)
+
+      val jsonDataType = sqlContext.read.parquet(path.toString).schema(0).dataType
+      assert(jsonDataType === StringType)
+      val bsonDataType = sqlContext.read.parquet(path.toString).schema(1).dataType
+      assert(bsonDataType === BinaryType)
+    }
+  }
+
   test("compression codec") {
     def compressionCodecFor(path: String, codecName: String): String = {
       val codecs = for {