author     hyukjinkwon <gurwls223@gmail.com>          2016-11-01 12:46:41 -0700
committer  Michael Armbrust <michael@databricks.com>  2016-11-01 12:46:41 -0700
commit     01dd0083011741c2bbe5ae1d2a25f2c9a1302b76 (patch)
tree       7b9993165b1a4f48e64d566d93c7883a3096403d /sql
parent     cfac17ee1cec414663b957228e469869eb7673c1 (diff)
[SPARK-17764][SQL] Add `to_json` support for converting a nested struct column to a JSON string
## What changes were proposed in this pull request?

This PR proposes to add a `to_json` function as the counterpart of `from_json` in Scala, Java and Python. It is useful to be able to convert the same column both from and to JSON. Also, some data sources do not support nested types; if we have to save a dataframe into one of those data sources, this function offers a workaround. The usage is as below:

```scala
val df = Seq(Tuple1(Tuple1(1))).toDF("a")
df.select(to_json($"a").as("json")).show()
```

```bash
+--------+
|    json|
+--------+
|{"_1":1}|
+--------+
```

## How was this patch tested?

Unit tests in `JsonFunctionsSuite` and `JsonExpressionsSuite`.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #15354 from HyukjinKwon/SPARK-17764.
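As a further illustration (not part of the patch), `to_json` composes with `struct` when the source columns are not already nested; a minimal sketch, assuming a SparkSession with `spark.implicits._` in scope and hypothetical column names:

```scala
import org.apache.spark.sql.functions.{struct, to_json}

// Hypothetical flat dataframe; wrap the columns into a struct before serializing.
val people = Seq((1, "alice"), (2, "bob")).toDF("id", "name")
people.select(to_json(struct($"id", $"name")).as("json")).show(false)
// {"id":1,"name":"alice"}
// {"id":2,"name":"bob"}
```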
Diffstat (limited to 'sql')
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala  | 48
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala (renamed from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala)  | 5
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala  | 26
-rw-r--r--  sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala  | 9
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala  | 2
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala  | 2
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/functions.scala  | 44
-rw-r--r--  sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala  | 30
8 files changed, 152 insertions, 14 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
index 65dbd6a4e3..244a5a34f3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
@@ -17,16 +17,17 @@
package org.apache.spark.sql.catalyst.expressions
-import java.io.{ByteArrayOutputStream, StringWriter}
+import java.io.{ByteArrayOutputStream, CharArrayWriter, StringWriter}
import scala.util.parsing.combinator.RegexParsers
import com.fasterxml.jackson.core._
+import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.json.{JacksonParser, JSONOptions, SparkSQLJsonProcessingException}
+import org.apache.spark.sql.catalyst.json._
import org.apache.spark.sql.catalyst.util.ParseModes
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
@@ -494,3 +495,46 @@ case class JsonToStruct(schema: StructType, options: Map[String, String], child:
override def inputTypes: Seq[AbstractDataType] = StringType :: Nil
}
+
+/**
+ * Converts a [[StructType]] to a json output string.
+ */
+case class StructToJson(options: Map[String, String], child: Expression)
+ extends Expression with CodegenFallback with ExpectsInputTypes {
+ override def nullable: Boolean = true
+
+ @transient
+ lazy val writer = new CharArrayWriter()
+
+ @transient
+ lazy val gen =
+ new JacksonGenerator(child.dataType.asInstanceOf[StructType], writer)
+
+ override def dataType: DataType = StringType
+ override def children: Seq[Expression] = child :: Nil
+
+ override def checkInputDataTypes(): TypeCheckResult = {
+ if (StructType.acceptsType(child.dataType)) {
+ try {
+ JacksonUtils.verifySchema(child.dataType.asInstanceOf[StructType])
+ TypeCheckResult.TypeCheckSuccess
+ } catch {
+ case e: UnsupportedOperationException =>
+ TypeCheckResult.TypeCheckFailure(e.getMessage)
+ }
+ } else {
+ TypeCheckResult.TypeCheckFailure(
+ s"$prettyName requires that the expression is a struct expression.")
+ }
+ }
+
+ override def eval(input: InternalRow): Any = {
+ gen.write(child.eval(input).asInstanceOf[InternalRow])
+ gen.flush()
+ val json = writer.toString
+ writer.reset()
+ UTF8String.fromString(json)
+ }
+
+ override def inputTypes: Seq[AbstractDataType] = StructType :: Nil
+}
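The expression reuses a single `CharArrayWriter`/`JacksonGenerator` pair across rows: `eval` writes the incoming `InternalRow`, flushes, snapshots the buffer as the result string, and resets the writer for the next row. A minimal sketch of driving it directly, mirroring the new unit test (catalyst-internal APIs, so this assumes a Spark 2.1-era classpath):

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Literal, StructToJson}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

val schema = StructType(StructField("a", IntegerType) :: Nil)
// Literal.create accepts an already-converted InternalRow of the given struct type.
val struct = Literal.create(InternalRow(1), schema)
val json = StructToJson(Map.empty, struct).eval(InternalRow.empty)
assert(json == UTF8String.fromString("""{"a":1}"""))
```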
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
index 5b55b70186..4b548e0e7f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
@@ -15,15 +15,14 @@
* limitations under the License.
*/
-package org.apache.spark.sql.execution.datasources.json
+package org.apache.spark.sql.catalyst.json
import java.io.Writer
import com.fasterxml.jackson.core._
-import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.json.JSONOptions
+import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils, MapData}
import org.apache.spark.sql.types._
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala
index c4d9abb2c0..3b23c6cd28 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonUtils.scala
@@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.json
import com.fasterxml.jackson.core.{JsonParser, JsonToken}
+import org.apache.spark.sql.types._
+
object JacksonUtils {
/**
* Advance the parser until a null or a specific token is found
@@ -29,4 +31,28 @@ object JacksonUtils {
case x => x != stopOn
}
}
+
+ /**
+ * Verify if the schema is supported for JSON conversion.
+ */
+ def verifySchema(schema: StructType): Unit = {
+ def verifyType(name: String, dataType: DataType): Unit = dataType match {
+ case NullType | BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType |
+ DoubleType | StringType | TimestampType | DateType | BinaryType | _: DecimalType =>
+
+ case st: StructType => st.foreach(field => verifyType(field.name, field.dataType))
+
+ case at: ArrayType => verifyType(name, at.elementType)
+
+ case mt: MapType => verifyType(name, mt.keyType)
+
+ case udt: UserDefinedType[_] => verifyType(name, udt.sqlType)
+
+ case _ =>
+ throw new UnsupportedOperationException(
+ s"Unable to convert column $name of type ${dataType.simpleString} to JSON.")
+ }
+
+ schema.foreach(field => verifyType(field.name, field.dataType))
+ }
}
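For clarity (not part of the patch), `verifySchema` walks the schema recursively and either returns silently or throws for the first unsupported leaf type; a small sketch of both outcomes, assuming the catalyst jar on the classpath:

```scala
import org.apache.spark.sql.catalyst.json.JacksonUtils
import org.apache.spark.sql.types._

// Atomic and nested atomic types verify silently.
val ok = new StructType()
  .add("id", IntegerType)
  .add("tags", ArrayType(StringType))
JacksonUtils.verifySchema(ok)

// CalendarIntervalType has no JSON encoding, so verification throws.
val bad = new StructType().add("i", CalendarIntervalType)
try JacksonUtils.verifySchema(bad) catch {
  case e: UnsupportedOperationException => println(e.getMessage)
  // Unable to convert column i of type calendarinterval to JSON.
}
```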
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
index 84623934d9..f9db649bc2 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala
@@ -343,4 +343,13 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
null
)
}
+
+ test("to_json") {
+ val schema = StructType(StructField("a", IntegerType) :: Nil)
+ val struct = Literal.create(create_row(1), schema)
+ checkEvaluation(
+ StructToJson(Map.empty, struct),
+ """{"a":1}"""
+ )
+ }
}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 6e0a2471e0..eb2b20afc3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -37,6 +37,7 @@ import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.encoders._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
+import org.apache.spark.sql.catalyst.json.JacksonGenerator
import org.apache.spark.sql.catalyst.optimizer.CombineUnions
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
@@ -45,7 +46,6 @@ import org.apache.spark.sql.catalyst.util.usePrettyExpression
import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution}
import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand, GlobalTempView, LocalTempView}
import org.apache.spark.sql.execution.datasources.LogicalRelation
-import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
import org.apache.spark.sql.execution.python.EvaluatePython
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.types._
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala
index 5a409c04c9..0e38aefecb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala
@@ -32,7 +32,7 @@ import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.json.{JacksonParser, JSONOptions}
+import org.apache.spark.sql.catalyst.json.{JacksonGenerator, JacksonParser, JSONOptions}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.datasources.text.TextOutputWriter
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5f1efd22d8..944a476114 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2883,10 +2883,10 @@ object functions {
* (Scala-specific) Parses a column containing a JSON string into a [[StructType]] with the
* specified schema. Returns `null` in the case of an unparseable string.
*
+ * @param e a string column containing JSON data.
* @param schema the schema to use when parsing the json string
* @param options options to control how the json is parsed. accepts the same options and the
* json data source.
- * @param e a string column containing JSON data.
*
* @group collection_funcs
* @since 2.1.0
@@ -2936,6 +2936,48 @@ object functions {
def from_json(e: Column, schema: String, options: java.util.Map[String, String]): Column =
from_json(e, DataType.fromJson(schema).asInstanceOf[StructType], options)
+
+ /**
+ * (Scala-specific) Converts a column containing a [[StructType]] into a JSON string with the
+ * specified schema. Throws an exception in the case of an unsupported type.
+ *
+ * @param e a struct column.
+ * @param options options to control how the struct column is converted into a json string.
+ * accepts the same options and the json data source.
+ *
+ * @group collection_funcs
+ * @since 2.1.0
+ */
+ def to_json(e: Column, options: Map[String, String]): Column = withExpr {
+ StructToJson(options, e.expr)
+ }
+
+ /**
+ * (Java-specific) Converts a column containing a [[StructType]] into a JSON string with the
+ * specified schema. Throws an exception in the case of an unsupported type.
+ *
+ * @param e a struct column.
+ * @param options options to control how the struct column is converted into a json string.
+ * accepts the same options and the json data source.
+ *
+ * @group collection_funcs
+ * @since 2.1.0
+ */
+ def to_json(e: Column, options: java.util.Map[String, String]): Column =
+ to_json(e, options.asScala.toMap)
+
+ /**
+ * Converts a column containing a [[StructType]] into a JSON string with the
+ * specified schema. Throws an exception in the case of an unsupported type.
+ *
+ * @param e a struct column.
+ *
+ * @group collection_funcs
+ * @since 2.1.0
+ */
+ def to_json(e: Column): Column =
+ to_json(e, Map.empty[String, String])
+
/**
* Returns length of array or map.
*
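Taken together, the three overloads mirror the existing `from_json` surface; a hedged usage sketch (`spark.implicits._` assumed in scope, and `timestampFormat` used only as an illustrative option key from the JSON data source):

```scala
import org.apache.spark.sql.functions.to_json

val df = Seq(Tuple1((1, java.sql.Timestamp.valueOf("2016-11-01 12:46:41")))).toDF("a")

// Default options.
df.select(to_json($"a")).show(false)

// Scala Map overload; the Java overload takes a java.util.Map with the same keys.
df.select(to_json($"a", Map("timestampFormat" -> "yyyy/MM/dd HH:mm"))).show(false)
```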
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
index 518d6e92b2..59ae889cf3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
@@ -17,9 +17,9 @@
package org.apache.spark.sql
-import org.apache.spark.sql.functions.from_json
+import org.apache.spark.sql.functions.{from_json, struct, to_json}
import org.apache.spark.sql.test.SharedSQLContext
-import org.apache.spark.sql.types.{IntegerType, StructType}
+import org.apache.spark.sql.types.{CalendarIntervalType, IntegerType, StructType}
class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
import testImplicits._
@@ -31,7 +31,6 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
Row("alice", "5"))
}
-
val tuples: Seq[(String, String)] =
("1", """{"f1": "value1", "f2": "value2", "f3": 3, "f5": 5.23}""") ::
("2", """{"f1": "value12", "f3": "value3", "f2": 2, "f4": 4.01}""") ::
@@ -97,7 +96,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
checkAnswer(expr, expected)
}
- test("json_parser") {
+ test("from_json") {
val df = Seq("""{"a": 1}""").toDS()
val schema = new StructType().add("a", IntegerType)
@@ -106,7 +105,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
Row(Row(1)) :: Nil)
}
- test("json_parser missing columns") {
+ test("from_json missing columns") {
val df = Seq("""{"a": 1}""").toDS()
val schema = new StructType().add("b", IntegerType)
@@ -115,7 +114,7 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
Row(Row(null)) :: Nil)
}
- test("json_parser invalid json") {
+ test("from_json invalid json") {
val df = Seq("""{"a" 1}""").toDS()
val schema = new StructType().add("a", IntegerType)
@@ -123,4 +122,23 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
df.select(from_json($"value", schema)),
Row(null) :: Nil)
}
+
+ test("to_json") {
+ val df = Seq(Tuple1(Tuple1(1))).toDF("a")
+
+ checkAnswer(
+ df.select(to_json($"a")),
+ Row("""{"_1":1}""") :: Nil)
+ }
+
+ test("to_json unsupported type") {
+ val df = Seq(Tuple1(Tuple1("interval -3 month 7 hours"))).toDF("a")
+ .select(struct($"a._1".cast(CalendarIntervalType).as("a")).as("c"))
+ val e = intercept[AnalysisException] {
+ // Unsupported type throws an exception
+ df.select(to_json($"c")).collect()
+ }
+ assert(e.getMessage.contains(
+ "Unable to convert column a of type calendarinterval to JSON."))
+ }
}
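Since `to_json` and `from_json` are now symmetric, a round trip is possible; a closing sketch (again assuming `spark.implicits._` in scope), not part of the test suite:

```scala
import org.apache.spark.sql.functions.{from_json, to_json}
import org.apache.spark.sql.types.{IntegerType, StructType}

val schema = new StructType().add("_1", IntegerType)
val df = Seq(Tuple1(Tuple1(1))).toDF("a")

// struct -> JSON string -> struct; the result matches the original column.
df.select(from_json(to_json($"a"), schema).as("b")).show()
// +---+
// |  b|
// +---+
// |[1]|
// +---+
```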