about | summary | refs | log | tree | commit | diff
path: root/python
diff options
context:
space:
mode:
author: Reynold Xin <rxin@databricks.com>, 2016-06-05 23:40:13 -0700
committer: Reynold Xin <rxin@databricks.com>, 2016-06-05 23:40:13 -0700
commit: 32f2f95dbdfb21491e46d4b608fd4e8ac7ab8973 (patch)
tree: 06ad80291a4e1d9e008837be4af2e3b7944e46a8 /python
parent: b7e8d1cb3ce932ba4a784be59744af8a8ef027ce (diff)
download: spark-32f2f95dbdfb21491e46d4b608fd4e8ac7ab8973.tar.gz
spark-32f2f95dbdfb21491e46d4b608fd4e8ac7ab8973.tar.bz2
spark-32f2f95dbdfb21491e46d4b608fd4e8ac7ab8973.zip
Revert "[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaviour"
This reverts commit b7e8d1cb3ce932ba4a784be59744af8a8ef027ce.
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/sql/readwriter.py 81
1 files changed, 42 insertions, 39 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 19aa8ddd06..9208a527d2 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -303,11 +303,10 @@ class DataFrameReader(object):
return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(path)))
@since(2.0)
- def csv(self, path, schema=None, sep=u',', encoding=u'UTF-8', quote=u'\"', escape=u'\\',
- comment=None, header='false', ignoreLeadingWhiteSpace='false',
- ignoreTrailingWhiteSpace='false', nullValue='', nanValue='NaN', positiveInf='Inf',
- negativeInf='Inf', dateFormat=None, maxColumns='20480', maxCharsPerColumn='1000000',
- mode='PERMISSIVE'):
+ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None,
+ comment=None, header=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
+ nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None,
+ maxColumns=None, maxCharsPerColumn=None, mode=None):
"""Loads a CSV file and returns the result as a [[DataFrame]].
This function goes through the input once to determine the input schema. To avoid going
@@ -316,41 +315,44 @@ class DataFrameReader(object):
:param path: string, or list of strings, for input path(s).
:param schema: an optional :class:`StructType` for the input schema.
:param sep: sets the single character as a separator for each field and value.
- The default value is ``,``.
- :param encoding: decodes the CSV files by the given encoding type.
- The default value is ``UTF-8``.
+ If None is set, it uses the default value, ``,``.
+ :param encoding: decodes the CSV files by the given encoding type. If None is set,
+ it uses the default value, ``UTF-8``.
:param quote: sets the single character used for escaping quoted values where the
- separator can be part of the value. The default value is ``"``.
+ separator can be part of the value. If None is set, it uses the default
+ value, ``"``.
:param escape: sets the single character used for escaping quotes inside an already
- quoted value. The default value is ``\``.
+ quoted value. If None is set, it uses the default value, ``\``.
:param comment: sets the single character used for skipping lines beginning with this
character. By default (None), it is disabled.
- :param header: uses the first line as names of columns. The default value is ``false``.
+ :param header: uses the first line as names of columns. If None is set, it uses the
+ default value, ``false``.
:param ignoreLeadingWhiteSpace: defines whether or not leading whitespaces from values
- being read should be skipped. The default value is
- ``false``.
+ being read should be skipped. If None is set, it uses
+ the default value, ``false``.
:param ignoreTrailingWhiteSpace: defines whether or not trailing whitespaces from values
- being read should be skipped. The default value is
- ``false``.
- :param nullValue: sets the string representation of a null value. The default value is a
- empty string.
- :param nanValue: sets the string representation of a non-number value. The default value is
- ``NaN``.
- :param positiveInf: sets the string representation of a positive infinity value. The default
- value is ``Inf``.
- :param negativeInf: sets the string representation of a negative infinity value. The default
- value is ``Inf``.
+ being read should be skipped. If None is set, it uses
+ the default value, ``false``.
+ :param nullValue: sets the string representation of a null value. If None is set, it uses
+ the default value, empty string.
+ :param nanValue: sets the string representation of a non-number value. If None is set, it
+ uses the default value, ``NaN``.
+ :param positiveInf: sets the string representation of a positive infinity value. If None
+ is set, it uses the default value, ``Inf``.
+ :param negativeInf: sets the string representation of a negative infinity value. If None
+ is set, it uses the default value, ``-Inf``.
:param dateFormat: sets the string that indicates a date format. Custom date formats
follow the formats at ``java.text.SimpleDateFormat``. This
applies to both date type and timestamp type. By default, it is None
which means trying to parse times and date by
``java.sql.Timestamp.valueOf()`` and ``java.sql.Date.valueOf()``.
- :param maxColumns: defines a hard limit of how many columns a record can have. The default
- value is ``20480``.
+ :param maxColumns: defines a hard limit of how many columns a record can have. If None is
+ set, it uses the default value, ``20480``.
:param maxCharsPerColumn: defines the maximum number of characters allowed for any given
- value being read. The default value is ``1000000``.
- :param mode: allows a mode for dealing with corrupt records during parsing. The default
- value is ``PERMISSIVE``.
+ value being read. If None is set, it uses the default value,
+ ``1000000``.
+ :param mode: allows a mode for dealing with corrupt records during parsing. If None is
+ set, it uses the default value, ``PERMISSIVE``.
* ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted record.
When a schema is set by user, it sets ``null`` for extra fields.
@@ -783,8 +785,8 @@ class DataFrameWriter(object):
self._jwrite.text(path)
@since(2.0)
- def csv(self, path, mode='error', compression=None, sep=',', quote=u'\"', escape='\\',
- header='false', nullValue='', escapeQuotes='true'):
+ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
+ header=None, nullValue=None, escapeQuotes=None):
"""Saves the content of the [[DataFrame]] in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
@@ -798,19 +800,20 @@ class DataFrameWriter(object):
:param compression: compression codec to use when saving to file. This can be one of the
known case-insensitive shorten names (none, bzip2, gzip, lz4,
snappy and deflate).
- :param sep: sets the single character as a separator for each field and value. The default
- value is ``,``.
+ :param sep: sets the single character as a separator for each field and value. If None is
+ set, it uses the default value, ``,``.
:param quote: sets the single character used for escaping quoted values where the
- separator can be part of the value. The default value is ``"``.
+ separator can be part of the value. If None is set, it uses the default
+ value, ``"``.
:param escape: sets the single character used for escaping quotes inside an already
- quoted value. The default value is ``\``
+ quoted value. If None is set, it uses the default value, ``\``.
:param escapeQuotes: A flag indicating whether values containing quotes should always
be enclosed in quotes. If None is set, it uses the default value
``true``, escaping all values containing a quote character.
- :param header: writes the names of columns as the first line. The default value is
- ``false``.
- :param nullValue: sets the string representation of a null value. The default value is a
- empty string.
+ :param header: writes the names of columns as the first line. If None is set, it uses
+ the default value, ``false``.
+ :param nullValue: sets the string representation of a null value. If None is set, it uses
+ the default value, empty string.
>>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
"""
@@ -828,7 +831,7 @@ class DataFrameWriter(object):
if nullValue is not None:
self.option("nullValue", nullValue)
if escapeQuotes is not None:
- self.option("escapeQuotes", escapeQuotes)
+ self.option("escapeQuotes", escapeQuotes)
self._jwrite.csv(path)
@since(1.5)