aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Armbrust <michael@databricks.com>2014-09-10 20:57:38 -0700
committerMichael Armbrust <michael@databricks.com>2014-09-10 20:57:38 -0700
commit84e2c8bfe41837baf2aeffa9741e4dbd14351981 (patch)
treeee307abd6291da4da321a7fa8f1212c18d810a8f
parent79cdb9b64ad2fa3ab7f2c221766d36658b917c40 (diff)
downloadspark-84e2c8bfe41837baf2aeffa9741e4dbd14351981.tar.gz
spark-84e2c8bfe41837baf2aeffa9741e4dbd14351981.tar.bz2
spark-84e2c8bfe41837baf2aeffa9741e4dbd14351981.zip
[SQL] Add test case with workaround for reading partitioned Avro files
In order to read from partitioned Avro files we need to also set the `SERDEPROPERTIES` since `TBLPROPERTIES` are not passed to the initialization. This PR simply adds a test to make sure we don't break this workaround. Author: Michael Armbrust <michael@databricks.com> Closes #2340 from marmbrus/avroPartitioned and squashes the following commits: 6b969d6 [Michael Armbrust] fix style fea2124 [Michael Armbrust] Add test case with workaround for reading partitioned avro files.
-rw-r--r--sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala69
-rw-r--r--sql/hive/src/test/resources/golden/Read Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de8
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala2
3 files changed, 78 insertions, 1 deletions
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
index a013f3f7a8..6974f3e581 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
@@ -269,7 +269,74 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
|)
""".stripMargin.cmd,
s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/episodes.avro")}' INTO TABLE episodes".cmd
- )
+ ),
+ // THIS TABLE IS NOT THE SAME AS THE HIVE TEST TABLE episodes_partitioned AS DYNAMIC PARTITIONING
+ // IS NOT YET SUPPORTED
+ TestTable("episodes_part",
+ s"""CREATE TABLE episodes_part (title STRING, air_date STRING, doctor INT)
+ |PARTITIONED BY (doctor_pt INT)
+ |ROW FORMAT SERDE '${classOf[AvroSerDe].getCanonicalName}'
+ |STORED AS
+ |INPUTFORMAT '${classOf[AvroContainerInputFormat].getCanonicalName}'
+ |OUTPUTFORMAT '${classOf[AvroContainerOutputFormat].getCanonicalName}'
+ |TBLPROPERTIES (
+ | 'avro.schema.literal'='{
+ | "type": "record",
+ | "name": "episodes",
+ | "namespace": "testing.hive.avro.serde",
+ | "fields": [
+ | {
+ | "name": "title",
+ | "type": "string",
+ | "doc": "episode title"
+ | },
+ | {
+ | "name": "air_date",
+ | "type": "string",
+ | "doc": "initial date"
+ | },
+ | {
+ | "name": "doctor",
+ | "type": "int",
+ | "doc": "main actor playing the Doctor in episode"
+ | }
+ | ]
+ | }'
+ |)
+ """.stripMargin.cmd,
+ // WORKAROUND: Required to pass schema to SerDe for partitioned tables.
+ // TODO: Pass this automatically from the table to partitions.
+ s"""
+ |ALTER TABLE episodes_part SET SERDEPROPERTIES (
+ | 'avro.schema.literal'='{
+ | "type": "record",
+ | "name": "episodes",
+ | "namespace": "testing.hive.avro.serde",
+ | "fields": [
+ | {
+ | "name": "title",
+ | "type": "string",
+ | "doc": "episode title"
+ | },
+ | {
+ | "name": "air_date",
+ | "type": "string",
+ | "doc": "initial date"
+ | },
+ | {
+ | "name": "doctor",
+ | "type": "int",
+ | "doc": "main actor playing the Doctor in episode"
+ | }
+ | ]
+ | }'
+ |)
+ """.stripMargin.cmd,
+ s"""
+ INSERT OVERWRITE TABLE episodes_part PARTITION (doctor_pt=1)
+ SELECT title, air_date, doctor FROM episodes
+ """.cmd
+ )
)
hiveQTestUtilTables.foreach(registerTestTable)
diff --git a/sql/hive/src/test/resources/golden/Read Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de b/sql/hive/src/test/resources/golden/Read Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de
new file mode 100644
index 0000000000..49c8434730
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/Read Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de
@@ -0,0 +1,8 @@
+The Eleventh Hour 3 April 2010 11 1
+The Doctor's Wife 14 May 2011 11 1
+Horror of Fang Rock 3 September 1977 4 1
+An Unearthly Child 23 November 1963 1 1
+The Mysterious Planet 6 September 1986 6 1
+Rose 26 March 2005 9 1
+The Power of the Daleks 5 November 1966 2 1
+Castrolava 4 January 1982 5 1
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
index 8bc72384a6..7486bfa82b 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
@@ -37,4 +37,6 @@ class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {
createQueryTest("Read with RegexSerDe", "SELECT * FROM sales")
createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")
+
+ createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}