author     Jurriaan Pruis <email@jurriaanpruis.nl>  2016-05-25 12:40:16 -0700
committer  Reynold Xin <rxin@databricks.com>        2016-05-25 12:40:16 -0700
commit     c875d81a3de3f209b9eb03adf96b7c740b2c7b52 (patch)
tree       f09a2b335d592b1c40e42b2f557e8f643768dc3e
parent     4b88067416ce922ae15a1445cf953fb9b5c43427 (diff)
download   spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.tar.gz
           spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.tar.bz2
           spark-c875d81a3de3f209b9eb03adf96b7c740b2c7b52.zip
[SPARK-15493][SQL] default QuoteEscapingEnabled flag to true when writing CSV
## What changes were proposed in this pull request?

Default the QuoteEscapingEnabled flag to true when writing CSV, and add an escapeQuotes option to be able to change this.

See https://github.com/uniVocity/univocity-parsers/blob/f3eb2af26374940e60d91d1703bde54619f50c51/src/main/java/com/univocity/parsers/csv/CsvWriterSettings.java#L231-L247

This change is needed to be able to write RFC 4180 compatible CSV files (https://tools.ietf.org/html/rfc4180#section-2).

https://issues.apache.org/jira/browse/SPARK-15493

## How was this patch tested?

Added a test that verifies the output is quoted correctly.

Author: Jurriaan Pruis <email@jurriaanpruis.nl>

Closes #13267 from jurriaan/quote-escaping.
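To make the new behaviour concrete, here is a minimal usage sketch of the option through the DataFrame writer API. It assumes a `spark` session in scope (as in the tests below); the sample data and output paths are illustrative, not part of the patch.

```scala
// Minimal sketch of the escapeQuotes option (paths and data are illustrative).
val df = spark.createDataFrame(Seq(("test \"quote\"", 123)))

// escapeQuotes defaults to true: a value containing the quote character is
// enclosed in quotes and its embedded quotes are escaped, giving RFC 4180
// output such as: "test ""quote""",123
df.write
  .format("csv")
  .option("quote", "\"")
  .option("escape", "\"")
  .save("/tmp/escaped-csv")

// Opting out restores the previous behaviour: values containing quotes are
// written without enclosing quotes, e.g.: test "quote",123
df.write
  .format("csv")
  .option("escapeQuotes", "false")
  .option("quote", "\"")
  .option("escape", "\"")
  .save("/tmp/unescaped-csv")
```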
-rw-r--r--  dev/deps/spark-deps-hadoop-2.2 | 2
-rw-r--r--  dev/deps/spark-deps-hadoop-2.3 | 2
-rw-r--r--  dev/deps/spark-deps-hadoop-2.4 | 2
-rw-r--r--  dev/deps/spark-deps-hadoop-2.6 | 2
-rw-r--r--  dev/deps/spark-deps-hadoop-2.7 | 2
-rw-r--r--  python/pyspark/sql/readwriter.py | 7
-rw-r--r--  sql/core/pom.xml | 2
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala | 3
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala | 2
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala | 1
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 51
11 files changed, 69 insertions, 7 deletions
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index a9068da8b2..0ac1c00154 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -159,7 +159,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
validation-api-1.1.0.Final.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 7e60a313ae..fa35fa7051 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
validation-api-1.1.0.Final.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 70d33b4f48..99dffa93bb 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -167,7 +167,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
validation-api-1.1.0.Final.jar
xbean-asm5-shaded-4.4.jar
xmlenc-0.52.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index a80f6bc2a4..a3bee36ce5 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -175,7 +175,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
validation-api-1.1.0.Final.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index c0b53f73cd..dbd7a8e0bf 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -176,7 +176,7 @@ stax-api-1.0.1.jar
stream-2.7.0.jar
stringtemplate-3.2.1.jar
super-csv-2.2.0.jar
-univocity-parsers-2.1.0.jar
+univocity-parsers-2.1.1.jar
validation-api-1.1.0.Final.jar
xbean-asm5-shaded-4.4.jar
xercesImpl-2.9.1.jar
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 6f788cf50c..73d2b81b6b 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -769,7 +769,7 @@ class DataFrameWriter(object):
@since(2.0)
def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
- header=None, nullValue=None):
+ header=None, nullValue=None, escapeQuotes=None):
"""Saves the content of the [[DataFrame]] in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
@@ -790,6 +790,9 @@ class DataFrameWriter(object):
value, ``"``.
:param escape: sets the single character used for escaping quotes inside an already
quoted value. If None is set, it uses the default value, ``\``
+ :param escapeQuotes: A flag indicating whether values containing quotes should always
+ be enclosed in quotes. If None is set, it uses the default value
+ ``true``, escaping all values containing a quote character.
:param header: writes the names of columns as the first line. If None is set, it uses
the default value, ``false``.
:param nullValue: sets the string representation of a null value. If None is set, it uses
@@ -810,6 +813,8 @@ class DataFrameWriter(object):
self.option("header", header)
if nullValue is not None:
self.option("nullValue", nullValue)
+ if escapeQuotes is not None:
+ self.option("escapeQuotes", nullValue)
self._jwrite.csv(path)
@since(1.5)
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 2ea980bf20..b833b9369e 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -39,7 +39,7 @@
<dependency>
<groupId>com.univocity</groupId>
<artifactId>univocity-parsers</artifactId>
- <version>2.1.0</version>
+ <version>2.1.1</version>
<type>jar</type>
</dependency>
<dependency>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 6f5fb69ea3..3aacce7d7f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -684,6 +684,9 @@ final class DataFrameWriter private[sql](df: DataFrame) {
* the separator can be part of the value.</li>
* <li>`escape` (default `\`): sets the single character used for escaping quotes inside
* an already quoted value.</li>
+ * <li>`escapeQuotes` (default `true`): a flag indicating whether values containing
+ * quotes should always be enclosed in quotes. Default is to escape all values containing
+ * a quote character.</li>
* <li>`header` (default `false`): writes the names of columns as the first line.</li>
* <li>`nullValue` (default empty string): sets the string representation of a null value.</li>
* <li>`compression` (default `null`): compression codec to use when saving to file. This can be
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index e4fd09462f..9f4ce8358b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -111,6 +111,8 @@ private[sql] class CSVOptions(@transient private val parameters: Map[String, Str
val maxCharsPerColumn = getInt("maxCharsPerColumn", 1000000)
+ val escapeQuotes = getBool("escapeQuotes", true)
+
val inputBufferSize = 128
val isCommentSet = this.comment != '\u0000'
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
index 111995da9c..b06f12369d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
@@ -75,6 +75,7 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten
writerSettings.setSkipEmptyLines(true)
writerSettings.setQuoteAllFields(false)
writerSettings.setHeaders(headers: _*)
+ writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)
private var buffer = new ByteArrayOutputStream()
private var writer = new CsvWriter(
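For reference, a standalone sketch of the univocity-parsers API that `LineCsvWriter` configures above. It assumes univocity-parsers 2.1.1 on the classpath (per the version bump in this patch) and an illustrative sample row; the call this patch wires through is `setQuoteEscapingEnabled`, driven by `params.escapeQuotes`.

```scala
import java.io.StringWriter

import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings}

// Mirrors the writer configuration in LineCsvWriter above.
val settings = new CsvWriterSettings()
settings.setSkipEmptyLines(true)
settings.setQuoteAllFields(false)
settings.setQuoteEscapingEnabled(true) // the setting params.escapeQuotes controls

val out = new StringWriter()
val writer = new CsvWriter(out, settings)
writer.writeRow("test \"quote\"", "123")
writer.close()

// With quote escaping enabled this prints: "test ""quote""",123
print(out.toString)
```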
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index bae290776f..ad7c05c12e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -366,6 +366,57 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}
}
+ test("save csv with quote escaping enabled") {
+ withTempDir { dir =>
+ val csvDir = new File(dir, "csv").getCanonicalPath
+
+ val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+ val df = spark.createDataFrame(data)
+
+ // escapeQuotes should be true by default
+ df.coalesce(1).write
+ .format("csv")
+ .option("quote", "\"")
+ .option("escape", "\"")
+ .save(csvDir)
+
+ val results = spark.read
+ .format("text")
+ .load(csvDir)
+ .collect()
+
+ val expected = "\"test \"\"quote\"\"\",123,\"it \"\"works\"\"!\",\"\"\"very\"\" well\""
+
+ assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+ }
+ }
+
+ test("save csv with quote escaping disabled") {
+ withTempDir { dir =>
+ val csvDir = new File(dir, "csv").getCanonicalPath
+
+ val data = Seq(("test \"quote\"", 123, "it \"works\"!", "\"very\" well"))
+ val df = spark.createDataFrame(data)
+
+ // disable quote escaping explicitly via the escapeQuotes option
+ df.coalesce(1).write
+ .format("csv")
+ .option("quote", "\"")
+ .option("escapeQuotes", "false")
+ .option("escape", "\"")
+ .save(csvDir)
+
+ val results = spark.read
+ .format("text")
+ .load(csvDir)
+ .collect()
+
+ val expected = "test \"quote\",123,it \"works\"!,\"\"\"very\"\" well\""
+
+ assert(results.toSeq.map(_.toSeq) === Seq(Seq(expected)))
+ }
+ }
+
test("commented lines in CSV data") {
val results = spark.read
.format("csv")