[SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV

## What changes were proposed in this pull request? Default QuoteEscapingEnabled flag to true when writing CSV and add an escapeQuotes option to be able to change this. See https://github.com/uniVocity/univocity-parsers/blob/f3eb2af26374940e60d91d1703bde54619f50c51/src/main/java/com/univocity/parsers/csv/CsvWriterSettings.java#L231-L247 This change is needed to be able to write RFC 4180 compatible CSV files (https://tools.ietf.org/html/rfc4180#section-2) https://issues.apache.org/jira/browse/SPARK-15493 ## How was this patch tested? Added a test that verifies the output is quoted correctly. Author: Jurriaan Pruis <email@jurriaanpruis.nl> Closes #13267 from jurriaan/quote-escaping.
author: Jurriaan Pruis <email@jurriaanpruis.nl> 2016-05-25 12:40:16 -0700
committer: Reynold Xin <rxin@databricks.com> 2016-05-25 12:40:16 -0700
commit: c875d81a3de3f209b9eb03adf96b7c740b2c7b52 (patch)
tree: f09a2b335d592b1c40e42b2f557e8f643768dc3e /sql
parent: 4b88067416ce922ae15a1445cf953fb9b5c43427 (diff)
download: spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.tar.gz
spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.tar.bz2
spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.zip
5 files changed, 58 insertions, 1 deletions
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 2ea980bf20..b833b9369e 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -39,7 +39,7 @@
     <dependency>
       <groupId>com.univocity</groupId>
       <artifactId>univocity-parsers</artifactId>
-      <version>2.1.0</version>
+      <version>2.1.1</version>
       <type>jar</type>
     </dependency>
     <dependency>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 6f5fb69ea3..3aacce7d7f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -684,6 +684,9 @@ final class DataFrameWriter private[sql](df: DataFrame) {
    * the separator can be part of the value.</li>
    * <li>`escape` (default `\`): sets the single character used for escaping quotes inside
    * an already quoted value.</li>
+   * <li>`escapeQuotes` (default `true`): a flag indicating whether values containing
+   * quotes should always be enclosed in quotes. Default is to escape all values containing
+   * a quote character.</li>
    * <li>`header` (default `false`): writes the names of columns as the first line.</li>
    * <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
    * <li>`compression` (default `null`): compression codec to use when saving to file. This can be
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index e4fd09462f..9f4ce8358b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -111,6 +111,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str
 
   val maxCharsPerColumn = getInt("maxCharsPerColumn", 1000000)
 
+  val escapeQuotes = getBool("escapeQuotes", true)
+
   val inputBufferSize = 128
 
   val isCommentSet = this.comment != '\u0000'
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
index 111995da9c..b06f12369d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
@@ -75,6 +75,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten
   writerSettings.setSkipEmptyLines(true)
   writerSettings.setQuoteAllFields(false)
   writerSettings.setHeaders(headers: _*)
+  writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)
 
   private var buffer = new ByteArrayOutputStream()
   private var writer = new CsvWriter(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index bae290776f..ad7c05c12e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -366,6 +366,57 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     }
   }
 
+  test("save csv with quote escaping enabled") {
+    withTempDir { dir =>
+      val csvDir = new File(dir, "csv").getCanonicalPath
+
+      val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+      val df = spark.createDataFrame(data)
+
+      // escapeQuotes should be true by default
+      df.coalesce(1).write
+        .format("csv")
+        .option("quote", "\"")
+        .option("escape", "\"")
+        .save(csvDir)
+
+      val results = spark.read
+        .format("text")
+        .load(csvDir)
+        .collect()
+
+      val expected = "\"test \"\"quote\"\"\",123,\"it \"\"works\"\"!\",\"\"\"very\"\" well\""
+
+      assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+    }
+  }
+
+  test("save csv with quote escaping disabled") {
+    withTempDir { dir =>
+      val csvDir = new File(dir, "csv").getCanonicalPath
+
+      val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+      val df = spark.createDataFrame(data)
+
+      // escapeQuotes should be true by default
+      df.coalesce(1).write
+        .format("csv")
+        .option("quote", "\"")
+        .option("escapeQuotes", "false")
+        .option("escape", "\"")
+        .save(csvDir)
+
+      val results = spark.read
+        .format("text")
+        .load(csvDir)
+        .collect()
+
+      val expected = "test \"quote\",123,it \"works\"!,\"\"\"very\"\" well\""
+
+      assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+    }
+  }
+
   test("commented lines in CSV data") {
     val results = spark.read
       .format("csv")
author	Jurriaan Pruis <email@jurriaanpruis.nl>	2016-05-25 12:40:16 -0700
committer	Reynold Xin <rxin@databricks.com>	2016-05-25 12:40:16 -0700
commit	c875d81a3de3f209b9eb03adf96b7c740b2c7b52 (patch)
tree	f09a2b335d592b1c40e42b2f557e8f643768dc3e /sql
parent	4b88067416ce922ae15a1445cf953fb9b5c43427 (diff)
download	spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.tar.gz spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.tar.bz2 spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.zip