aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCazen <Cazen@korea.com>2016-01-03 17:01:19 -0800
committerReynold Xin <rxin@databricks.com>2016-01-03 17:01:19 -0800
commitb8410ff9ce8cef7159a7364272e4c4234c5b474f (patch)
tree2da8cdd8496644f9d71937cddca54844c6db1ee2
parent7b92922f7f7ba4ff398dcbd734e8305ba03da87b (diff)
downloadspark-b8410ff9ce8cef7159a7364272e4c4234c5b474f.tar.gz
spark-b8410ff9ce8cef7159a7364272e4c4234c5b474f.tar.bz2
spark-b8410ff9ce8cef7159a7364272e4c4234c5b474f.zip
[SPARK-12537][SQL] Add option to accept quoting of all character backslash quoting mechanism
This adds an option so the JSON parser can be configured to accept (or reject) backslash-quoting of any character. Author: Cazen <Cazen@korea.com> Author: Cazen Lee <cazen.lee@samsung.com> Author: Cazen Lee <Cazen@korea.com> Author: cazen.lee <cazen.lee@samsung.com> Closes #10497 from Cazen/master.
-rw-r--r--python/pyspark/sql/readwriter.py2
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala2
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala9
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala19
4 files changed, 30 insertions, 2 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index a3d7eca04b..a2771daabe 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -160,6 +160,8 @@ class DataFrameReader(object):
quotes
* ``allowNumericLeadingZeros`` (default ``false``): allows leading zeros in numbers \
(e.g. 00012)
+ * ``allowBackslashEscapingAnyCharacter`` (default ``false``): allows accepting quoting \
+ of all character using backslash quoting mechanism
>>> df1 = sqlContext.read.json('python/test_support/sql/people.json')
>>> df1.dtypes
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 0acea95344..6debb302d9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -258,6 +258,8 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
* </li>
* <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
* (e.g. 00012)</li>
+ * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
+ * character using backslash quoting mechanism</li>
*
* @since 1.6.0
*/
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
index c132ead20e..f805c00925 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
@@ -31,7 +31,8 @@ case class JSONOptions(
allowUnquotedFieldNames: Boolean = false,
allowSingleQuotes: Boolean = true,
allowNumericLeadingZeros: Boolean = false,
- allowNonNumericNumbers: Boolean = false) {
+ allowNonNumericNumbers: Boolean = false,
+ allowBackslashEscapingAnyCharacter: Boolean = false) {
/** Sets config options on a Jackson [[JsonFactory]]. */
def setJacksonOptions(factory: JsonFactory): Unit = {
@@ -40,6 +41,8 @@ case class JSONOptions(
factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, allowSingleQuotes)
factory.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, allowNumericLeadingZeros)
factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers)
+ factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
+ allowBackslashEscapingAnyCharacter)
}
}
@@ -59,6 +62,8 @@ object JSONOptions {
allowNumericLeadingZeros =
parameters.get("allowNumericLeadingZeros").map(_.toBoolean).getOrElse(false),
allowNonNumericNumbers =
- parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true)
+ parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true),
+ allowBackslashEscapingAnyCharacter =
+ parameters.get("allowBackslashEscapingAnyCharacter").map(_.toBoolean).getOrElse(false)
)
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
index 4cc0a3a958..1742df31bb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala
@@ -111,4 +111,23 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
assert(df.schema.head.name == "age")
assert(df.first().getDouble(0).isNaN)
}
+
+ test("allowBackslashEscapingAnyCharacter off") {
+ val str = """{"name": "Cazen Lee", "price": "\$10"}"""
+ val rdd = sqlContext.sparkContext.parallelize(Seq(str))
+ val df = sqlContext.read.option("allowBackslashEscapingAnyCharacter", "false").json(rdd)
+
+ assert(df.schema.head.name == "_corrupt_record")
+ }
+
+ test("allowBackslashEscapingAnyCharacter on") {
+ val str = """{"name": "Cazen Lee", "price": "\$10"}"""
+ val rdd = sqlContext.sparkContext.parallelize(Seq(str))
+ val df = sqlContext.read.option("allowBackslashEscapingAnyCharacter", "true").json(rdd)
+
+ assert(df.schema.head.name == "name")
+ assert(df.schema.last.name == "price")
+ assert(df.first().getString(0) == "Cazen Lee")
+ assert(df.first().getString(1) == "$10")
+ }
}