aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJurriaan Pruis <email@jurriaanpruis.nl>2016-07-08 11:45:41 -0700
committerReynold Xin <rxin@databricks.com>2016-07-08 11:45:41 -0700
commit38cf8f2a50068f80350740ac28e31c8accd20634 (patch)
treef86ea19bcb87b7b44ffdbf8092e1bd56e8bb5000
parent255d74fe4a0db2cc842177ec735bbde07c7c8732 (diff)
downloadspark-38cf8f2a50068f80350740ac28e31c8accd20634.tar.gz
spark-38cf8f2a50068f80350740ac28e31c8accd20634.tar.bz2
spark-38cf8f2a50068f80350740ac28e31c8accd20634.zip
[SPARK-13638][SQL] Add quoteAll option to CSV DataFrameWriter
## What changes were proposed in this pull request? Adds an quoteAll option for writing CSV which will quote all fields. See https://issues.apache.org/jira/browse/SPARK-13638 ## How was this patch tested? Added a test to verify the output columns are quoted for all fields in the Dataframe Author: Jurriaan Pruis <email@jurriaanpruis.nl> Closes #13374 from jurriaan/csv-quote-all.
-rw-r--r--python/pyspark/sql/readwriter.py7
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala2
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala2
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala2
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala26
5 files changed, 36 insertions, 3 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 78d992e415..f7c354f513 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -633,7 +633,7 @@ class DataFrameWriter(OptionUtils):
@since(2.0)
def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
- header=None, nullValue=None, escapeQuotes=None):
+ header=None, nullValue=None, escapeQuotes=None, quoteAll=None):
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
@@ -658,6 +658,9 @@ class DataFrameWriter(OptionUtils):
:param escapeQuotes: A flag indicating whether values containing quotes should always
be enclosed in quotes. If None is set, it uses the default value
``true``, escaping all values containing a quote character.
+ :param quoteAll: A flag indicating whether all values should always be enclosed in
+ quotes. If None is set, it uses the default value ``false``,
+ only escaping values containing a quote character.
:param header: writes the names of columns as the first line. If None is set, it uses
the default value, ``false``.
:param nullValue: sets the string representation of a null value. If None is set, it uses
@@ -667,7 +670,7 @@ class DataFrameWriter(OptionUtils):
"""
self.mode(mode)
self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header,
- nullValue=nullValue, escapeQuotes=escapeQuotes)
+ nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll)
self._jwrite.csv(path)
@since(1.5)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index f77af76d2b..12b304623d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -537,6 +537,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
* <li>`escapeQuotes` (default `true`): a flag indicating whether values containing
* quotes should always be enclosed in quotes. Default is to escape all values containing
* a quote character.</li>
+ * <li>`quoteAll` (default `false`): A flag indicating whether all values should always be
+ * enclosed in quotes. Default is to only escape values containing a quote character.</li>
* <li>`header` (default `false`): writes the names of columns as the first line.</li>
* <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
* <li>`compression` (default `null`): compression codec to use when saving to file. This can be
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 581eda7e09..22fb8163b1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -115,6 +115,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str
val maxMalformedLogPerPartition = getInt("maxMalformedLogPerPartition", 10)
+ val quoteAll = getBool("quoteAll", false)
+
val inputBufferSize = 128
val isCommentSet = this.comment != '\u0000'
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
index bf62732dd4..13ae76d498 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
@@ -78,7 +78,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten
writerSettings.setNullValue(params.nullValue)
writerSettings.setEmptyValue(params.nullValue)
writerSettings.setSkipEmptyLines(true)
- writerSettings.setQuoteAllFields(false)
+ writerSettings.setQuoteAllFields(params.quoteAll)
writerSettings.setHeaders(headers: _*)
writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index f170065132..311f1fa8d2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -366,6 +366,32 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}
}
+ test("save csv with quoteAll enabled") {
+ withTempDir { dir =>
+ val csvDir = new File(dir, "csv").getCanonicalPath
+
+ val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+ val df = spark.createDataFrame(data)
+
+ // escapeQuotes should be true by default
+ df.coalesce(1).write
+ .format("csv")
+ .option("quote", "\"")
+ .option("escape", "\"")
+ .option("quoteAll", "true")
+ .save(csvDir)
+
+ val results = spark.read
+ .format("text")
+ .load(csvDir)
+ .collect()
+
+ val expected = "\"test \"\"quote\"\"\",\"123\",\"it \"\"works\"\"!\",\"\"\"very\"\" well\""
+
+ assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+ }
+ }
+
test("save csv with quote escaping enabled") {
withTempDir { dir =>
val csvDir = new File(dir, "csv").getCanonicalPath