aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorTakuya UESHIN <ueshin@databricks.com>2017-03-14 13:57:23 -0700
committerXiao Li <gatorsmile@gmail.com>2017-03-14 13:57:23 -0700
commit7ded39c223429265b23940ca8244660dbee8320c (patch)
treee52bc19910347af47ecf24deaae2431866d043b4 /python
parent6eac96823c7b244773bd810812b369e336a65837 (diff)
downloadspark-7ded39c223429265b23940ca8244660dbee8320c.tar.gz
spark-7ded39c223429265b23940ca8244660dbee8320c.tar.bz2
spark-7ded39c223429265b23940ca8244660dbee8320c.zip
[SPARK-19817][SQL] Make it clear that `timeZone` option is a general option in DataFrameReader/Writer.
## What changes were proposed in this pull request? Because the timezone setting can also affect partition values, it applies to all formats, and we should make that clear. ## How was this patch tested? Existing tests. Author: Takuya UESHIN <ueshin@databricks.com> Closes #17281 from ueshin/issues/SPARK-19817.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/sql/readwriter.py46
1 files changed, 28 insertions, 18 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 4354345ebc..705803791d 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -109,6 +109,11 @@ class DataFrameReader(OptionUtils):
@since(1.5)
def option(self, key, value):
"""Adds an input option for the underlying data source.
+
+ You can set the following option(s) for reading files:
+ * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
+ in the JSON/CSV datasources or partition values.
+ If it isn't set, it uses the default value, session local timezone.
"""
self._jreader = self._jreader.option(key, to_str(value))
return self
@@ -116,6 +121,11 @@ class DataFrameReader(OptionUtils):
@since(1.4)
def options(self, **options):
"""Adds input options for the underlying data source.
+
+ You can set the following option(s) for reading files:
+ * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
+ in the JSON/CSV datasources or partition values.
+ If it isn't set, it uses the default value, session local timezone.
"""
for k in options:
self._jreader = self._jreader.option(k, to_str(options[k]))
@@ -159,7 +169,7 @@ class DataFrameReader(OptionUtils):
allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None,
allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None,
mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None,
- timeZone=None, wholeFile=None):
+ wholeFile=None):
"""
Loads JSON files and returns the results as a :class:`DataFrame`.
@@ -214,8 +224,6 @@ class DataFrameReader(OptionUtils):
formats follow the formats at ``java.text.SimpleDateFormat``.
This applies to timestamp type. If None is set, it uses the
default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSZZ``.
- :param timeZone: sets the string that indicates a timezone to be used to parse timestamps.
- If None is set, it uses the default value, session local timezone.
:param wholeFile: parse one record, which may span multiple lines, per file. If None is
set, it uses the default value, ``false``.
@@ -234,7 +242,7 @@ class DataFrameReader(OptionUtils):
allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero,
allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter,
mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,
- timestampFormat=timestampFormat, timeZone=timeZone, wholeFile=wholeFile)
+ timestampFormat=timestampFormat, wholeFile=wholeFile)
if isinstance(path, basestring):
path = [path]
if type(path) == list:
@@ -307,7 +315,7 @@ class DataFrameReader(OptionUtils):
comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None,
ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,
negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,
- maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, timeZone=None,
+ maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
columnNameOfCorruptRecord=None, wholeFile=None):
"""Loads a CSV file and returns the result as a :class:`DataFrame`.
@@ -367,8 +375,6 @@ class DataFrameReader(OptionUtils):
uses the default value, ``10``.
:param mode: allows a mode for dealing with corrupt records during parsing. If None is
set, it uses the default value, ``PERMISSIVE``.
- :param timeZone: sets the string that indicates a timezone to be used to parse timestamps.
- If None is set, it uses the default value, session local timezone.
* ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted \
record, and puts the malformed string into a field configured by \
@@ -399,7 +405,7 @@ class DataFrameReader(OptionUtils):
nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf,
dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns,
maxCharsPerColumn=maxCharsPerColumn,
- maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, timeZone=timeZone,
+ maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,
columnNameOfCorruptRecord=columnNameOfCorruptRecord, wholeFile=wholeFile)
if isinstance(path, basestring):
path = [path]
@@ -521,6 +527,11 @@ class DataFrameWriter(OptionUtils):
@since(1.5)
def option(self, key, value):
"""Adds an output option for the underlying data source.
+
+ You can set the following option(s) for writing files:
+ * ``timeZone``: sets the string that indicates a timezone to be used to format
+ timestamps in the JSON/CSV datasources or partition values.
+ If it isn't set, it uses the default value, session local timezone.
"""
self._jwrite = self._jwrite.option(key, to_str(value))
return self
@@ -528,6 +539,11 @@ class DataFrameWriter(OptionUtils):
@since(1.4)
def options(self, **options):
"""Adds output options for the underlying data source.
+
+ You can set the following option(s) for writing files:
+ * ``timeZone``: sets the string that indicates a timezone to be used to format
+ timestamps in the JSON/CSV datasources or partition values.
+ If it isn't set, it uses the default value, session local timezone.
"""
for k in options:
self._jwrite = self._jwrite.option(k, to_str(options[k]))
@@ -619,8 +635,7 @@ class DataFrameWriter(OptionUtils):
self._jwrite.saveAsTable(name)
@since(1.4)
- def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None,
- timeZone=None):
+ def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None):
"""Saves the content of the :class:`DataFrame` in JSON format at the specified path.
:param path: the path in any Hadoop supported file system
@@ -641,15 +656,12 @@ class DataFrameWriter(OptionUtils):
formats follow the formats at ``java.text.SimpleDateFormat``.
This applies to timestamp type. If None is set, it uses the
default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSZZ``.
- :param timeZone: sets the string that indicates a timezone to be used to format timestamps.
- If None is set, it uses the default value, session local timezone.
>>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
"""
self.mode(mode)
self._set_opts(
- compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat,
- timeZone=timeZone)
+ compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat)
self._jwrite.json(path)
@since(1.4)
@@ -696,7 +708,7 @@ class DataFrameWriter(OptionUtils):
@since(2.0)
def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
- timestampFormat=None, timeZone=None):
+ timestampFormat=None):
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
@@ -736,15 +748,13 @@ class DataFrameWriter(OptionUtils):
formats follow the formats at ``java.text.SimpleDateFormat``.
This applies to timestamp type. If None is set, it uses the
default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSZZ``.
- :param timeZone: sets the string that indicates a timezone to be used to parse timestamps.
- If None is set, it uses the default value, session local timezone.
>>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
"""
self.mode(mode)
self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header,
nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll,
- dateFormat=dateFormat, timestampFormat=timestampFormat, timeZone=timeZone)
+ dateFormat=dateFormat, timestampFormat=timestampFormat)
self._jwrite.csv(path)
@since(1.5)