diff options
author | Jurriaan Pruis <email@jurriaanpruis.nl> | 2016-07-08 11:45:41 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2016-07-08 11:45:41 -0700 |
commit | 38cf8f2a50068f80350740ac28e31c8accd20634 (patch) | |
tree | f86ea19bcb87b7b44ffdbf8092e1bd56e8bb5000 /sql/core | |
parent | 255d74fe4a0db2cc842177ec735bbde07c7c8732 (diff) | |
download | spark-38cf8f2a50068f80350740ac28e31c8accd20634.tar.gz spark-38cf8f2a50068f80350740ac28e31c8accd20634.tar.bz2 spark-38cf8f2a50068f80350740ac28e31c8accd20634.zip |
[SPARK-13638][SQL] Add quoteAll option to CSV DataFrameWriter
## What changes were proposed in this pull request?
Adds a quoteAll option for writing CSV which will quote all fields.
See https://issues.apache.org/jira/browse/SPARK-13638
## How was this patch tested?
Added a test to verify that the output columns are quoted for all fields in the DataFrame.
Author: Jurriaan Pruis <email@jurriaanpruis.nl>
Closes #13374 from jurriaan/csv-quote-all.
Diffstat (limited to 'sql/core')
4 files changed, 31 insertions, 1 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index f77af76d2b..12b304623d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -537,6 +537,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * <li>`escapeQuotes` (default `true`): a flag indicating whether values containing * quotes should always be enclosed in quotes. Default is to escape all values containing * a quote character.</li> + * <li>`quoteAll` (default `false`): A flag indicating whether all values should always be + * enclosed in quotes. Default is to only escape values containing a quote character.</li> * <li>`header` (default `false`): writes the names of columns as the first line.</li> * <li>`nullValue` (default empty string): sets the string representation of a null value.</li> * <li>`compression` (default `null`): compression codec to use when saving to file. 
This can be diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index 581eda7e09..22fb8163b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -115,6 +115,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str val maxMalformedLogPerPartition = getInt("maxMalformedLogPerPartition", 10) + val quoteAll = getBool("quoteAll", false) + val inputBufferSize = 128 val isCommentSet = this.comment != '\u0000' diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala index bf62732dd4..13ae76d498 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala @@ -78,7 +78,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten writerSettings.setNullValue(params.nullValue) writerSettings.setEmptyValue(params.nullValue) writerSettings.setSkipEmptyLines(true) - writerSettings.setQuoteAllFields(false) + writerSettings.setQuoteAllFields(params.quoteAll) writerSettings.setHeaders(headers: _*) writerSettings.setQuoteEscapingEnabled(params.escapeQuotes) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index f170065132..311f1fa8d2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -366,6 +366,32 @@ class CSVSuite 
extends QueryTest with SharedSQLContext with SQLTestUtils { } } + test("save csv with quoteAll enabled") { + withTempDir { dir => + val csvDir = new File(dir, "csv").getCanonicalPath + + val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well")) + val df = spark.createDataFrame(data) + + // escapeQuotes should be true by default + df.coalesce(1).write + .format("csv") + .option("quote", "\"") + .option("escape", "\"") + .option("quoteAll", "true") + .save(csvDir) + + val results = spark.read + .format("text") + .load(csvDir) + .collect() + + val expected = "\"test \"\"quote\"\"\",\"123\",\"it \"\"works\"\"!\",\"\"\"very\"\" well\"" + + assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected))) + } + } + test("save csv with quote escaping enabled") { withTempDir { dir => val csvDir = new File(dir, "csv").getCanonicalPath |