author    hyukjinkwon <gurwls223@gmail.com>  2017-04-12 09:16:39 +0100
committer Sean Owen <sowen@cloudera.com>  2017-04-12 09:16:39 +0100
commit bca4259f12b32eeb156b6755d0ec5e16d8e566b3 (patch)
tree   c7da055477f7498b6efebc60fe7a7f9a0c6ea353
parent b9384382484a9f5c6b389742e7fdf63865de81c0 (diff)
[MINOR][DOCS] JSON APIs related documentation fixes
## What changes were proposed in this pull request?

This PR proposes corrections related to JSON APIs, as below:

- Rendering links in the Python documentation
- Replacing `RDD` with `Dataset` in the programming guide
- Adding the missing description about JSON Lines consistently to `DataFrameReader.json` in the Python API
- De-duplicating a little of `DataFrameReader.json` in the Scala/Java API

## How was this patch tested?

Manually built the documentation via `jekyll build`. Corresponding screenshots will be left as comments on the changed code.

Note that there are currently Javadoc 8 breaks in several places. These are proposed to be handled in https://github.com/apache/spark/pull/17477, so this PR does not fix them.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #17602 from HyukjinKwon/minor-json-documentation.
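For context, the `Dataset[String]`-based JSON reading that the corrected guide text describes looks roughly as follows. This is a minimal, self-contained Scala sketch; the session setup and sample record are illustrative and not part of this patch:

```scala
import org.apache.spark.sql.{Dataset, SparkSession}

val spark = SparkSession.builder()
  .appName("json-lines-sketch")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// One JSON object per element: the Dataset[String] input that the guide
// now documents in place of the older RDD[String] wording.
val jsonData: Dataset[String] = Seq(
  """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}"""
).toDS()

// Spark SQL infers the schema from the strings and returns a DataFrame.
val people = spark.read.json(jsonData)
people.printSchema()
```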
-rw-r--r--  docs/sql-programming-guide.md  4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java  2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala  2
-rw-r--r--  python/pyspark/sql/readwriter.py  8
-rw-r--r--  python/pyspark/sql/streaming.py  4
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala  4
6 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index c425faca4c..28942b68fa 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -883,7 +883,7 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
<div data-lang="scala" markdown="1">
Spark SQL can automatically infer the schema of a JSON dataset and load it as a `Dataset[Row]`.
-This conversion can be done using `SparkSession.read.json()` on either an RDD of String,
+This conversion can be done using `SparkSession.read.json()` on either a `Dataset[String]`,
or a JSON file.
Note that the file that is offered as _a json file_ is not a typical JSON file. Each
@@ -897,7 +897,7 @@ For a regular multi-line JSON file, set the `wholeFile` option to `true`.
<div data-lang="java" markdown="1">
Spark SQL can automatically infer the schema of a JSON dataset and load it as a `Dataset<Row>`.
-This conversion can be done using `SparkSession.read().json()` on either an RDD of String,
+This conversion can be done using `SparkSession.read().json()` on either a `Dataset<String>`,
or a JSON file.
Note that the file that is offered as _a json file_ is not a typical JSON file. Each
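The surrounding guide text for the second hunk also mentions setting the `wholeFile` option for regular multi-line JSON files. A minimal sketch of that call as documented at this commit, reusing the `spark` session from the sketch above (the option was renamed `multiLine` in later releases; the path is illustrative):

```scala
// By default Spark expects JSON Lines (one record per line); a regular
// multi-line JSON file needs the `wholeFile` option instead.
val multiLinePeople = spark.read
  .option("wholeFile", true)
  .json("examples/src/main/resources/people.json")  // illustrative path
```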
diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
index 1a7054614b..b66abaed66 100644
--- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
@@ -215,7 +215,7 @@ public class JavaSQLDataSourceExample {
// +------+
// Alternatively, a DataFrame can be created for a JSON dataset represented by
- // an Dataset[String] storing one JSON object per string.
+ // a Dataset<String> storing one JSON object per string.
List<String> jsonData = Arrays.asList(
"{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
Dataset<String> anotherPeopleDataset = spark.createDataset(jsonData, Encoders.STRING());
diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
index 82fd56de39..ad74da72bd 100644
--- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala
@@ -139,7 +139,7 @@ object SQLDataSourceExample {
// +------+
// Alternatively, a DataFrame can be created for a JSON dataset represented by
- // an Dataset[String] storing one JSON object per string
+ // a Dataset[String] storing one JSON object per string
val otherPeopleDataset = spark.createDataset(
"""{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
val otherPeople = spark.read.json(otherPeopleDataset)
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index d912f395da..960fb882cf 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -173,8 +173,8 @@ class DataFrameReader(OptionUtils):
"""
Loads JSON files and returns the results as a :class:`DataFrame`.
- `JSON Lines <http://jsonlines.org/>`_(newline-delimited JSON) is supported by default.
- For JSON (one record per file), set the `wholeFile` parameter to ``true``.
+ `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
+ For JSON (one record per file), set the ``wholeFile`` parameter to ``true``.
If the ``schema`` parameter is not specified, this function goes
through the input once to determine the input schema.
@@ -634,7 +634,9 @@ class DataFrameWriter(OptionUtils):
@since(1.4)
def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None):
- """Saves the content of the :class:`DataFrame` in JSON format at the specified path.
+ """Saves the content of the :class:`DataFrame` in JSON format
+ (`JSON Lines text format or newline-delimited JSON <http://jsonlines.org/>`_) at the
+ specified path.
:param path: the path in any Hadoop supported file system
:param mode: specifies the behavior of the save operation when data already exists.
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 3b60496341..65b59d480d 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -405,8 +405,8 @@ class DataStreamReader(OptionUtils):
"""
Loads a JSON file stream and returns the results as a :class:`DataFrame`.
- `JSON Lines <http://jsonlines.org/>`_(newline-delimited JSON) is supported by default.
- For JSON (one record per file), set the `wholeFile` parameter to ``true``.
+ `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
+ For JSON (one record per file), set the ``wholeFile`` parameter to ``true``.
If the ``schema`` parameter is not specified, this function goes
through the input once to determine the input schema.
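The streaming reader documented in this hunk consumes the same JSON Lines input. A minimal Scala sketch, again reusing `spark` and supplying an explicit schema up front (the watched directory is illustrative):

```scala
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

// Streaming sources normally take a user-supplied schema; inferring one
// would mean scanning unbounded input before the query starts.
val schema = new StructType()
  .add("name", StringType)
  .add("age", IntegerType)

val peopleStream = spark.readStream
  .schema(schema)
  .json("/tmp/incoming-json")  // illustrative watched directory
```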
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 49691c15d0..c1b3291741 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -268,8 +268,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
}
/**
- * Loads a JSON file (<a href="http://jsonlines.org/">JSON Lines text format or
- * newline-delimited JSON</a>) and returns the result as a `DataFrame`.
+ * Loads a JSON file and returns the results as a `DataFrame`.
+ *
* See the documentation on the overloaded `json()` method with varargs for more details.
*
* @since 1.4.0
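The de-duplicated scaladoc above now defers to the varargs overload of `json()`; both forms below end up on the same code path. A minimal sketch with illustrative paths, reusing `spark` from the first sketch:

```scala
// Single-path form, whose scaladoc is trimmed in this hunk...
val one = spark.read.json("/data/a.jsonl")

// ...and the varargs overload it points readers to.
val many = spark.read.json("/data/a.jsonl", "/data/b.jsonl")
```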