[SPARK-13638][SQL] Add quoteAll option to CSV DataFrameWriter

## What changes were proposed in this pull request? Adds an quoteAll option for writing CSV which will quote all fields. See https://issues.apache.org/jira/browse/SPARK-13638 ## How was this patch tested? Added a test to verify the output columns are quoted for all fields in the Dataframe Author: Jurriaan Pruis <email@jurriaanpruis.nl> Closes #13374 from jurriaan/csv-quote-all.
author: Jurriaan Pruis <email@jurriaanpruis.nl> 2016-07-08 11:45:41 -0700
committer: Reynold Xin <rxin@databricks.com> 2016-07-08 11:45:41 -0700
commit: 38cf8f2a50068f80350740ac28e31c8accd20634 (patch)
tree: f86ea19bcb87b7b44ffdbf8092e1bd56e8bb5000
parent: 255d74fe4a0db2cc842177ec735bbde07c7c8732 (diff)
download: spark-38cf8f2a50068f80350740ac28e31c8accd20634.tar.gz
spark-38cf8f2a50068f80350740ac28e31c8accd20634.tar.bz2
spark-38cf8f2a50068f80350740ac28e31c8accd20634.zip
5 files changed, 36 insertions, 3 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 78d992e415..f7c354f513 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -633,7 +633,7 @@ class DataFrameWriter(OptionUtils):
 
     @since(2.0)
     def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
-            header=None, nullValue=None, escapeQuotes=None):
+            header=None, nullValue=None, escapeQuotes=None, quoteAll=None):
         """Saves the content of the :class:`DataFrame` in CSV format at the specified path.
 
         :param path: the path in any Hadoop supported file system
@@ -658,6 +658,9 @@ class DataFrameWriter(OptionUtils):
         :param escapeQuotes: A flag indicating whether values containing quotes should always
                              be enclosed in quotes. If None is set, it uses the default value
                              ``true``, escaping all values containing a quote character.
+        :param quoteAll: A flag indicating whether all values should always be enclosed in
+                          quotes. If None is set, it uses the default value ``false``,
+                          only escaping values containing a quote character.
         :param header: writes the names of columns as the first line. If None is set, it uses
                        the default value, ``false``.
         :param nullValue: sets the string representation of a null value. If None is set, it uses
@@ -667,7 +670,7 @@ class DataFrameWriter(OptionUtils):
         """
         self.mode(mode)
         self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header,
-                       nullValue=nullValue, escapeQuotes=escapeQuotes)
+                       nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll)
         self._jwrite.csv(path)
 
     @since(1.5)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index f77af76d2b..12b304623d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -537,6 +537,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
    * <li>`escapeQuotes` (default `true`): a flag indicating whether values containing
    * quotes should always be enclosed in quotes. Default is to escape all values containing
    * a quote character.</li>
+   * <li>`quoteAll` (default `false`): A flag indicating whether all values should always be
+   * enclosed in quotes. Default is to only escape values containing a quote character.</li>
    * <li>`header` (default `false`): writes the names of columns as the first line.</li>
    * <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
    * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 581eda7e09..22fb8163b1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -115,6 +115,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str
 
   val maxMalformedLogPerPartition = getInt("maxMalformedLogPerPartition", 10)
 
+  val quoteAll = getBool("quoteAll", false)
+
   val inputBufferSize = 128
 
   val isCommentSet = this.comment != '\u0000'
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
index bf62732dd4..13ae76d498 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
@@ -78,7 +78,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten
   writerSettings.setNullValue(params.nullValue)
   writerSettings.setEmptyValue(params.nullValue)
   writerSettings.setSkipEmptyLines(true)
-  writerSettings.setQuoteAllFields(false)
+  writerSettings.setQuoteAllFields(params.quoteAll)
   writerSettings.setHeaders(headers: _*)
   writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index f170065132..311f1fa8d2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -366,6 +366,32 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     }
   }
 
+  test("save csv with quoteAll enabled") {
+    withTempDir { dir =>
+      val csvDir = new File(dir, "csv").getCanonicalPath
+
+      val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+      val df = spark.createDataFrame(data)
+
+      // escapeQuotes should be true by default
+      df.coalesce(1).write
+        .format("csv")
+        .option("quote", "\"")
+        .option("escape", "\"")
+        .option("quoteAll", "true")
+        .save(csvDir)
+
+      val results = spark.read
+        .format("text")
+        .load(csvDir)
+        .collect()
+
+      val expected = "\"test \"\"quote\"\"\",\"123\",\"it \"\"works\"\"!\",\"\"\"very\"\" well\""
+
+      assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+    }
+  }
+
   test("save csv with quote escaping enabled") {
     withTempDir { dir =>
       val csvDir = new File(dir, "csv").getCanonicalPath
author	Jurriaan Pruis <email@jurriaanpruis.nl>	2016-07-08 11:45:41 -0700
committer	Reynold Xin <rxin@databricks.com>	2016-07-08 11:45:41 -0700
commit	38cf8f2a50068f80350740ac28e31c8accd20634 (patch)
tree	f86ea19bcb87b7b44ffdbf8092e1bd56e8bb5000
parent	255d74fe4a0db2cc842177ec735bbde07c7c8732 (diff)
download	spark-38cf8f2a50068f80350740ac28e31c8accd20634.tar.gz spark-38cf8f2a50068f80350740ac28e31c8accd20634.tar.bz2 spark-38cf8f2a50068f80350740ac28e31c8accd20634.zip