From bb2b9d0a428b86bf366ee9916e26402f8c00912f Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Tue, 9 Aug 2016 10:23:54 +0800
Subject: [SPARK-16610][SQL] Add `orc.compress` as an alias for `compression` option.

## What changes were proposed in this pull request?

For the ORC source, Spark SQL has a writer option `compression`, which is used to set the codec; its value is also propagated to `orc.compress` (the ORC configuration key for the codec). However, if a user sets only `orc.compress` in the writer options, we should not fall back to the default value of `compression` (snappy) as the codec. Instead, we should respect the value of `orc.compress`.

This PR makes the ORC data source stop ignoring `orc.compress` when `compression` is unset. The resulting behaviour is:

1. Check `compression` and use it if it is set.
2. If `compression` is not set, check `orc.compress` and use it.
3. If neither `compression` nor `orc.compress` is set, use the default (snappy).
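
For illustration only (not part of this patch), the intended usage looks roughly like the sketch below. The object name, output paths and local-mode session are made up for the example, and Hive support is enabled because the ORC source currently lives in `sql/hive`:

```scala
// Hypothetical sketch of the option precedence described above.
import org.apache.spark.sql.SparkSession

object OrcCompressExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("orc-compress-example")
      .enableHiveSupport() // the ORC data source is in the sql/hive module
      .getOrCreate()

    // Only `orc.compress` is set: with this change the files are written with ZLIB
    // instead of silently falling back to the `compression` default (snappy).
    spark.range(0, 10).write
      .mode("overwrite")
      .option("orc.compress", "ZLIB")
      .orc("/tmp/orc_compress_only")

    // Both options are set: `compression` takes precedence, so ZLIB is used
    // even though `orc.compress` asks for SNAPPY.
    spark.range(0, 10).write
      .mode("overwrite")
      .option("compression", "ZLIB")
      .option("orc.compress", "SNAPPY")
      .orc("/tmp/orc_both_options")

    spark.stop()
  }
}
```

Both writes should produce ZLIB-compressed files: the first because `orc.compress` is now respected, the second because `compression` wins when both are given.
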
## How was this patch tested?

Unit test in `OrcQuerySuite`.

Author: hyukjinkwon

Closes #14518 from HyukjinKwon/SPARK-16610.
---
 .../org/apache/spark/sql/hive/orc/OrcOptions.scala | 12 ++++++++---
 .../apache/spark/sql/hive/orc/OrcQuerySuite.scala  | 23 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala
index 91cf0dc960..c2a126d3bf 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala
@@ -20,8 +20,7 @@ package org.apache.spark.sql.hive.orc
 /**
  * Options for the ORC data source.
  */
-private[orc] class OrcOptions(
-    @transient private val parameters: Map[String, String])
+private[orc] class OrcOptions(@transient private val parameters: Map[String, String])
   extends Serializable {
 
   import OrcOptions._
@@ -31,7 +30,14 @@ private[orc] class OrcOptions(
    * Acceptable values are defined in [[shortOrcCompressionCodecNames]].
    */
   val compressionCodec: String = {
-    val codecName = parameters.getOrElse("compression", "snappy").toLowerCase
+    // `orc.compress` is an ORC configuration, so we respect it as an option here, but
+    // `compression` has higher precedence than `orc.compress`: if both are set,
+    // we will use `compression`.
+    val orcCompressionConf = parameters.get(OrcRelation.ORC_COMPRESSION)
+    val codecName = parameters
+      .get("compression")
+      .orElse(orcCompressionConf)
+      .getOrElse("snappy").toLowerCase
     if (!shortOrcCompressionCodecNames.contains(codecName)) {
       val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase)
       throw new IllegalArgumentException(s"Codec [$codecName] " +
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index 49e963ee12..b13878d578 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -161,6 +161,29 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
     }
   }
 
+  test("SPARK-16610: Respect orc.compress option when compression is unset") {
+    // Respect `orc.compress`.
+    withTempPath { file =>
+      spark.range(0, 10).write
+        .option("orc.compress", "ZLIB")
+        .orc(file.getCanonicalPath)
+      val expectedCompressionKind =
+        OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression
+      assert("ZLIB" === expectedCompressionKind.name())
+    }
+
+    // `compression` overrides `orc.compress`.
+    withTempPath { file =>
+      spark.range(0, 10).write
+        .option("compression", "ZLIB")
+        .option("orc.compress", "SNAPPY")
+        .orc(file.getCanonicalPath)
+      val expectedCompressionKind =
+        OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression
+      assert("ZLIB" === expectedCompressionKind.name())
+    }
+  }
+
   // Hive supports zlib, snappy and none for Hive 1.2.1.
   test("Compression options for writing to an ORC file (SNAPPY, ZLIB and NONE)") {
     withTempPath { file =>
-- 
cgit v1.2.3