aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
author: hyukjinkwon <gurwls223@gmail.com> 2016-04-08 00:28:59 -0700
committer: Reynold Xin <rxin@databricks.com> 2016-04-08 00:28:59 -0700
commit: 725b860e2b7b675d95b10c46f2b329c30cd21faf (patch)
tree: fe3191cbdf6b58ea4c993c7d02691758f574423f /sql
parent: 04fb7dba704afa4e20eb8c72d6568f7f55694157 (diff)
downloadspark-725b860e2b7b675d95b10c46f2b329c30cd21faf.tar.gz
spark-725b860e2b7b675d95b10c46f2b329c30cd21faf.tar.bz2
spark-725b860e2b7b675d95b10c46f2b329c30cd21faf.zip
[SPARK-14103][SQL] Parse unescaped quotes in CSV data source.
## What changes were proposed in this pull request?

This PR fixes the parsing of unescaped quotes in CSV input data. For example, the data below:

```
"a"b,ccc,ddd
e,f,g
```

is currently parsed as shown under **Before**, and with this change as shown under **After**:

- **Before**
```bash
["a"b,ccc,ddd[\n]e,f,g] <- parsed as a single value.
```

- **After**
```bash
["a"b], [ccc], [ddd]
[e], [f], [g]
```

This PR bumps the Univocity parser's version to `2.0.2`, in which this problem was fixed (see https://github.com/uniVocity/univocity-parsers/issues/60).

## How was this patch tested?

Unit tests in `CSVSuite` and `sbt/sbt scalastyle`.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #12226 from HyukjinKwon/SPARK-14103-quote.
Diffstat (limited to 'sql')
-rw-r--r-- sql/core/pom.xml | 2
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala | 1
-rw-r--r-- sql/core/src/test/resources/unescaped-quotes.csv | 2
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 12
4 files changed, 16 insertions, 1 deletions
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 708670b292..8b1017042c 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -39,7 +39,7 @@
<dependency>
<groupId>com.univocity</groupId>
<artifactId>univocity-parsers</artifactId>
- <version>1.5.6</version>
+ <version>2.0.2</version>
<type>jar</type>
</dependency>
<dependency>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
index 5570b2c173..c3d863f547 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala
@@ -47,6 +47,7 @@ private[sql] abstract class CsvReader(params: CSVOptions, headers: Seq[String])
settings.setMaxColumns(params.maxColumns)
settings.setNullValue(params.nullValue)
settings.setMaxCharsPerColumn(params.maxCharsPerColumn)
+ settings.setParseUnescapedQuotesUntilDelimiter(true)
if (headers != null) settings.setHeaders(headers: _*)
new CsvParser(settings)
diff --git a/sql/core/src/test/resources/unescaped-quotes.csv b/sql/core/src/test/resources/unescaped-quotes.csv
new file mode 100644
index 0000000000..7c68055575
--- /dev/null
+++ b/sql/core/src/test/resources/unescaped-quotes.csv
@@ -0,0 +1,2 @@
+"a"b,ccc,ddd
+ab,cc"c,ddd"
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 58d9d69d9a..9baae80f15 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -45,6 +45,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
private val disableCommentsFile = "disable_comments.csv"
private val boolFile = "bool.csv"
private val simpleSparseFile = "simple_sparse.csv"
+ private val unescapedQuotesFile = "unescaped-quotes.csv"
private def testFile(fileName: String): String = {
Thread.currentThread().getContextClassLoader.getResource(fileName).toString
@@ -140,6 +141,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
verifyCars(cars, withHeader = true)
}
+ test("parse unescaped quotes with maxCharsPerColumn") {
+ val rows = sqlContext.read
+ .format("csv")
+ .option("maxCharsPerColumn", "4")
+ .load(testFile(unescapedQuotesFile))
+
+ val expectedRows = Seq(Row("\"a\"b", "ccc", "ddd"), Row("ab", "cc\"c", "ddd\""))
+
+ checkAnswer(rows, expectedRows)
+ }
+
test("bad encoding name") {
val exception = intercept[UnsupportedCharsetException] {
sqlContext