aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLiwei Lin <lwlin7@gmail.com>2017-03-14 22:30:16 -0700
committerTakuya UESHIN <ueshin@happy-camper.st>2017-03-14 22:30:16 -0700
commite1ac553402ab82bbc72fd64e5943b71c16b4b37d (patch)
tree335764b1b30d9245115f1debb32eff3ffa10e4db
parentf9a93b1b4a20e7c72d900362b269edab66e73dd8 (diff)
downloadspark-e1ac553402ab82bbc72fd64e5943b71c16b4b37d.tar.gz
spark-e1ac553402ab82bbc72fd64e5943b71c16b4b37d.tar.bz2
spark-e1ac553402ab82bbc72fd64e5943b71c16b4b37d.zip
[SPARK-19817][SS] Make it clear that `timeZone` is a general option in DataStreamReader/Writer
## What changes were proposed in this pull request? As the timezone setting can also affect partition values and works for all formats, we should make it clear. ## How was this patch tested? N/A Author: Liwei Lin <lwlin7@gmail.com> Closes #17299 from lw-lin/timezone.
-rw-r--r--python/pyspark/sql/readwriter.py8
-rw-r--r--python/pyspark/sql/streaming.py32
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala6
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala6
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala22
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala18
6 files changed, 70 insertions, 22 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 705803791d..122e17f202 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -112,7 +112,7 @@ class DataFrameReader(OptionUtils):
You can set the following option(s) for reading files:
* ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
- in the JSON/CSV datasources or parttion values.
+ in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
"""
self._jreader = self._jreader.option(key, to_str(value))
@@ -124,7 +124,7 @@ class DataFrameReader(OptionUtils):
You can set the following option(s) for reading files:
* ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
- in the JSON/CSV datasources or parttion values.
+ in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
"""
for k in options:
@@ -530,7 +530,7 @@ class DataFrameWriter(OptionUtils):
You can set the following option(s) for writing files:
* ``timeZone``: sets the string that indicates a timezone to be used to format
- timestamps in the JSON/CSV datasources or parttion values.
+ timestamps in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
"""
self._jwrite = self._jwrite.option(key, to_str(value))
@@ -542,7 +542,7 @@ class DataFrameWriter(OptionUtils):
You can set the following option(s) for writing files:
* ``timeZone``: sets the string that indicates a timezone to be used to format
- timestamps in the JSON/CSV datasources or parttion values.
+ timestamps in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
"""
for k in options:
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 625fb9ba38..288cc1e4f6 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -373,6 +373,11 @@ class DataStreamReader(OptionUtils):
def option(self, key, value):
"""Adds an input option for the underlying data source.
+ You can set the following option(s) for reading files:
+ * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
+ in the JSON/CSV datasources or partition values.
+ If it isn't set, it uses the default value, session local timezone.
+
.. note:: Experimental.
>>> s = spark.readStream.option("x", 1)
@@ -384,6 +389,11 @@ class DataStreamReader(OptionUtils):
def options(self, **options):
"""Adds input options for the underlying data source.
+ You can set the following option(s) for reading files:
+ * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
+ in the JSON/CSV datasources or partition values.
+ If it isn't set, it uses the default value, session local timezone.
+
.. note:: Experimental.
>>> s = spark.readStream.options(x="1", y=2)
@@ -429,7 +439,7 @@ class DataStreamReader(OptionUtils):
allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None,
allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None,
mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None,
- timeZone=None, wholeFile=None):
+ wholeFile=None):
"""
Loads a JSON file stream and returns the results as a :class:`DataFrame`.
@@ -486,8 +496,6 @@ class DataStreamReader(OptionUtils):
formats follow the formats at ``java.text.SimpleDateFormat``.
This applies to timestamp type. If None is set, it uses the
default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSZZ``.
- :param timeZone: sets the string that indicates a timezone to be used to parse timestamps.
- If None is set, it uses the default value, session local timezone.
:param wholeFile: parse one record, which may span multiple lines, per file. If None is
set, it uses the default value, ``false``.
@@ -503,7 +511,7 @@ class DataStreamReader(OptionUtils):
allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero,
allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter,
mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,
- timestampFormat=timestampFormat, timeZone=timeZone, wholeFile=wholeFile)
+ timestampFormat=timestampFormat, wholeFile=wholeFile)
if isinstance(path, basestring):
return self._df(self._jreader.json(path))
else:
@@ -561,7 +569,7 @@ class DataStreamReader(OptionUtils):
comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None,
ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,
negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,
- maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, timeZone=None,
+ maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, wholeFile=None):
"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
@@ -619,8 +627,6 @@ class DataStreamReader(OptionUtils):
``-1`` meaning unlimited length.
:param mode: allows a mode for dealing with corrupt records during parsing. If None is
set, it uses the default value, ``PERMISSIVE``.
- :param timeZone: sets the string that indicates a timezone to be used to parse timestamps.
- If None is set, it uses the default value, session local timezone.
* ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted \
record, and puts the malformed string into a field configured by \
@@ -653,7 +659,7 @@ class DataStreamReader(OptionUtils):
nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf,
dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns,
maxCharsPerColumn=maxCharsPerColumn,
- maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, timeZone=timeZone,
+ maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,
columnNameOfCorruptRecord=columnNameOfCorruptRecord, wholeFile=wholeFile)
if isinstance(path, basestring):
return self._df(self._jreader.csv(path))
@@ -721,6 +727,11 @@ class DataStreamWriter(object):
def option(self, key, value):
"""Adds an output option for the underlying data source.
+ You can set the following option(s) for writing files:
+ * ``timeZone``: sets the string that indicates a timezone to be used to format
+ timestamps in the JSON/CSV datasources or partition values.
+ If it isn't set, it uses the default value, session local timezone.
+
.. note:: Experimental.
"""
self._jwrite = self._jwrite.option(key, to_str(value))
@@ -730,6 +741,11 @@ class DataStreamWriter(object):
def options(self, **options):
"""Adds output options for the underlying data source.
+ You can set the following option(s) for writing files:
+ * ``timeZone``: sets the string that indicates a timezone to be used to format
+ timestamps in the JSON/CSV datasources or partition values.
+ If it isn't set, it uses the default value, session local timezone.
+
.. note:: Experimental.
"""
for k in options:
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 309654c804..88fbfb4c92 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -73,7 +73,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
* You can set the following option(s):
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to parse timestamps in the JSON/CSV datasources or parttion values.</li>
+ * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
* </ul>
*
* @since 1.4.0
@@ -110,7 +110,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
* You can set the following option(s):
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to parse timestamps in the JSON/CSV datasources or parttion values.</li>
+ * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
* </ul>
*
* @since 1.4.0
@@ -126,7 +126,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
* You can set the following option(s):
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to parse timestamps in the JSON/CSV datasources or parttion values.</li>
+ * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
* </ul>
*
* @since 1.4.0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index 608160a214..deaa800694 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -93,7 +93,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
* You can set the following option(s):
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to format timestamps in the JSON/CSV datasources or parttion values.</li>
+ * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
* </ul>
*
* @since 1.4.0
@@ -130,7 +130,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
* You can set the following option(s):
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to format timestamps in the JSON/CSV datasources or parttion values.</li>
+ * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
* </ul>
*
* @since 1.4.0
@@ -146,7 +146,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
* You can set the following option(s):
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to format timestamps in the JSON/CSV datasources or parttion values.</li>
+ * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
* </ul>
*
* @since 1.4.0
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
index aed8074a64..388ef182ce 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -61,6 +61,12 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
/**
* Adds an input option for the underlying data source.
*
+ * You can set the following option(s):
+ * <ul>
+ * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
+ * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
+ * </ul>
+ *
* @since 2.0.0
*/
def option(key: String, value: String): DataStreamReader = {
@@ -92,6 +98,12 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
/**
* (Scala-specific) Adds input options for the underlying data source.
*
+ * You can set the following option(s):
+ * <ul>
+ * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
+ * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
+ * </ul>
+ *
* @since 2.0.0
*/
def options(options: scala.collection.Map[String, String]): DataStreamReader = {
@@ -102,6 +114,12 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
/**
* Adds input options for the underlying data source.
*
+ * You can set the following option(s):
+ * <ul>
+ * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
+ * to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
+ * </ul>
+ *
* @since 2.0.0
*/
def options(options: java.util.Map[String, String]): DataStreamReader = {
@@ -186,8 +204,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
* <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSZZ`): sets the string that
* indicates a timestamp format. Custom date formats follow the formats at
* `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
- * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to parse timestamps.</li>
* <li>`wholeFile` (default `false`): parse one record, which may span multiple lines,
* per file</li>
* </ul>
@@ -239,8 +255,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
* <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSZZ`): sets the string that
* indicates a timestamp format. Custom date formats follow the formats at
* `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
- * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
- * to be used to parse timestamps.</li>
* <li>`maxColumns` (default `20480`): defines a hard limit of how many columns
* a record can have.</li>
* <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala
index c8fda8cd83..fe52013bad 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala
@@ -145,6 +145,12 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) {
/**
* Adds an output option for the underlying data source.
*
+ * You can set the following option(s):
+ * <ul>
+ * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
+ * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
+ * </ul>
+ *
* @since 2.0.0
*/
def option(key: String, value: String): DataStreamWriter[T] = {
@@ -176,6 +182,12 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) {
/**
* (Scala-specific) Adds output options for the underlying data source.
*
+ * You can set the following option(s):
+ * <ul>
+ * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
+ * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
+ * </ul>
+ *
* @since 2.0.0
*/
def options(options: scala.collection.Map[String, String]): DataStreamWriter[T] = {
@@ -186,6 +198,12 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) {
/**
* Adds output options for the underlying data source.
*
+ * You can set the following option(s):
+ * <ul>
+ * <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
+ * to be used to format timestamps in the JSON/CSV datasources or partition values.</li>
+ * </ul>
+ *
* @since 2.0.0
*/
def options(options: java.util.Map[String, String]): DataStreamWriter[T] = {