aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorhyukjinkwon <gurwls223@gmail.com>2016-05-01 19:05:20 -0700
committerReynold Xin <rxin@databricks.com>2016-05-01 19:05:20 -0700
commita832cef11233c6357c7ba7ede387b432e6b0ed71 (patch)
tree6496cc0664a4df9c32b9f15a3c7940250dec1f7d /python
parenta6428292f78fd594f41a4a7bf254d40268f46305 (diff)
downloadspark-a832cef11233c6357c7ba7ede387b432e6b0ed71.tar.gz
spark-a832cef11233c6357c7ba7ede387b432e6b0ed71.tar.bz2
spark-a832cef11233c6357c7ba7ede387b432e6b0ed71.zip
[SPARK-13425][SQL] Documentation for CSV datasource options
## What changes were proposed in this pull request? This PR adds the explanation and documentation for CSV options for reading and writing. ## How was this patch tested? Style tests with `./dev/run_tests` for documentation style. Author: hyukjinkwon <gurwls223@gmail.com> Author: Hyukjin Kwon <gurwls223@gmail.com> Closes #12817 from HyukjinKwon/SPARK-13425.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/sql/readwriter.py52
1 file changed, 52 insertions, 0 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index ed9e716ab7..cc5e93dcad 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -282,6 +282,45 @@ class DataFrameReader(object):
:param paths: string, or list of strings, for input path(s).
+ You can set the following CSV-specific options to deal with CSV files:
+ * ``sep`` (default ``,``): sets the single character as a separator \
+ for each field and value.
+ * ``charset`` (default ``UTF-8``): decodes the CSV files by the given \
+ encoding type.
+ * ``quote`` (default ``"``): sets the single character used for escaping \
+ quoted values where the separator can be part of the value.
+ * ``escape`` (default ``\``): sets the single character used for escaping quotes \
+ inside an already quoted value.
+ * ``comment`` (default empty string): sets the single character used for skipping \
+ lines beginning with this character. By default, it is disabled.
+ * ``header`` (default ``false``): uses the first line as names of columns.
+ * ``ignoreLeadingWhiteSpace`` (default ``false``): defines whether or not leading \
+ whitespaces from values being read should be skipped.
+ * ``ignoreTrailingWhiteSpace`` (default ``false``): defines whether or not trailing \
+ whitespaces from values being read should be skipped.
+ * ``nullValue`` (default empty string): sets the string representation of a null value.
+ * ``nanValue`` (default ``NaN``): sets the string representation of a non-number \
+ value.
+ * ``positiveInf`` (default ``Inf``): sets the string representation of a positive \
+ infinity value.
+ * ``negativeInf`` (default ``-Inf``): sets the string representation of a negative \
+ infinity value.
+ * ``dateFormat`` (default ``None``): sets the string that indicates a date format. \
+ Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This \
+ applies to both date type and timestamp type. By default, it is None which means \
+ trying to parse times and date by ``java.sql.Timestamp.valueOf()`` and \
+ ``java.sql.Date.valueOf()``.
+ * ``maxColumns`` (default ``20480``): defines a hard limit of how many columns \
+ a record can have.
+ * ``maxCharsPerColumn`` (default ``1000000``): defines the maximum number of \
+ characters allowed for any given value being read.
+ * ``mode`` (default ``PERMISSIVE``): allows a mode for dealing with corrupt records \
+ during parsing.
+ * ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted record. \
+ When a schema is set by the user, it sets ``null`` for extra fields.
+ * ``DROPMALFORMED`` : ignores corrupted records entirely.
+ * ``FAILFAST`` : throws an exception when it meets corrupted records.
+
>>> df = sqlContext.read.csv('python/test_support/sql/ages.csv')
>>> df.dtypes
[('C0', 'string'), ('C1', 'string')]
@@ -663,6 +702,19 @@ class DataFrameWriter(object):
known case-insensitive shorten names (none, bzip2, gzip, lz4,
snappy and deflate).
+ You can set the following CSV-specific options to deal with CSV files:
+ * ``sep`` (default ``,``): sets the single character as a separator \
+ for each field and value.
+ * ``quote`` (default ``"``): sets the single character used for escaping \
+ quoted values where the separator can be part of the value.
+ * ``escape`` (default ``\``): sets the single character used for escaping quotes \
+ inside an already quoted value.
+ * ``header`` (default ``false``): writes the names of columns as the first line.
+ * ``nullValue`` (default empty string): sets the string representation of a null value.
+ * ``compression``: compression codec to use when saving to file. This can be one of \
+ the known case-insensitive shorten names (none, bzip2, gzip, lz4, snappy and \
+ deflate).
+
>>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
"""
self.mode(mode)