author    hyukjinkwon <gurwls223@gmail.com>    2017-03-23 00:25:01 -0700
committer Felix Cheung <felixcheung@apache.org>    2017-03-23 00:25:01 -0700
commit    07c12c09a75645f6b56b30654455b3838b7b6637 (patch)
tree      7680418bff0d7885ea8bdefd0d3e182f751f3606 /python/pyspark/sql
parent    12cd00706cbfff4c8ac681fcae65b4c4c8751877 (diff)
[SPARK-18579][SQL] Use ignoreLeadingWhiteSpace and ignoreTrailingWhiteSpace options in CSV writing
## What changes were proposed in this pull request?

This PR proposes to support _not_ trimming the white spaces when writing out. Both `ignoreLeadingWhiteSpace` and `ignoreTrailingWhiteSpace` default to `false` in the CSV reading path, but they default to `true` in the Univocity parser's CSV writing path. Neither option is currently passed through for writing, so the white spaces are always trimmed. It seems we should provide an easy way to keep these white spaces.

With the data below:

```scala
val df = spark.read.csv(Seq("a , b , c").toDS)
df.show()
```

```
+---+----+---+
|_c0| _c1|_c2|
+---+----+---+
| a | b | c|
+---+----+---+
```

**Before**

```scala
df.write.csv("/tmp/text.csv")
spark.read.text("/tmp/text.csv").show()
```

```
+-----+
|value|
+-----+
|a,b,c|
+-----+
```

It seems this can't be worked around via `quoteAll` either.

```scala
df.write.option("quoteAll", true).csv("/tmp/text.csv")
spark.read.text("/tmp/text.csv").show()
```

```
+-----------+
|      value|
+-----------+
|"a","b","c"|
+-----------+
```

**After**

```scala
df.write.option("ignoreLeadingWhiteSpace", false).option("ignoreTrailingWhiteSpace", false).csv("/tmp/text.csv")
spark.read.text("/tmp/text.csv").show()
```

```
+----------+
|     value|
+----------+
|a , b , c|
+----------+
```

Note that this case is possible in R:

```r
> system("cat text.csv")
f1,f2,f3
a , b , c
> df <- read.csv(file="text.csv")
> df
  f1 f2 f3
1  a  b  c
> write.csv(df, file="text1.csv", quote=F, row.names=F)
> system("cat text1.csv")
f1,f2,f3
a , b , c
```

## How was this patch tested?

Unit tests in `CSVSuite` and manual tests for Python.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #17310 from HyukjinKwon/SPARK-18579.
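The diff below covers the Python side of the change. For reference, a rough PySpark counterpart of the Scala repro above, once this patch is applied (the `SparkSession` named `spark`, the sample values, and the `/tmp` output path are illustrative assumptions):

```python
# PySpark sketch of the same round trip; `spark` is an existing SparkSession
# and /tmp/text.csv is only an example output location.
df = spark.createDataFrame([("a ", " b ", " c")])

# Default behaviour: the writer trims, so the file contains "a,b,c".
df.write.mode("overwrite").csv("/tmp/text.csv")

# With both options disabled, the surrounding white spaces are kept.
df.write.mode("overwrite") \
    .option("ignoreLeadingWhiteSpace", False) \
    .option("ignoreTrailingWhiteSpace", False) \
    .csv("/tmp/text.csv")
spark.read.text("/tmp/text.csv").show()
```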
Diffstat (limited to 'python/pyspark/sql')
-rw-r--r--  python/pyspark/sql/readwriter.py  28
-rw-r--r--  python/pyspark/sql/streaming.py   12
-rw-r--r--  python/pyspark/sql/tests.py       13
3 files changed, 37 insertions(+), 16 deletions(-)
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 759c27507c..5e732b4bec 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -341,12 +341,12 @@ class DataFrameReader(OptionUtils):
default value, ``false``.
:param inferSchema: infers the input schema automatically from data. It requires one extra
pass over the data. If None is set, it uses the default value, ``false``.
- :param ignoreLeadingWhiteSpace: defines whether or not leading whitespaces from values
- being read should be skipped. If None is set, it uses
- the default value, ``false``.
- :param ignoreTrailingWhiteSpace: defines whether or not trailing whitespaces from values
- being read should be skipped. If None is set, it uses
- the default value, ``false``.
+ :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from
+ values being read should be skipped. If None is set, it
+ uses the default value, ``false``.
+ :param ignoreTrailingWhiteSpace: A flag indicating whether or not trailing whitespaces from
+ values being read should be skipped. If None is set, it
+ uses the default value, ``false``.
:param nullValue: sets the string representation of a null value. If None is set, it uses
the default value, empty string. Since 2.0.1, this ``nullValue`` param
applies to all supported types including the string type.
@@ -706,7 +706,7 @@ class DataFrameWriter(OptionUtils):
@since(2.0)
def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None,
- timestampFormat=None):
+ timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None):
"""Saves the content of the :class:`DataFrame` in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
@@ -728,10 +728,10 @@ class DataFrameWriter(OptionUtils):
empty string.
:param escape: sets the single character used for escaping quotes inside an already
quoted value. If None is set, it uses the default value, ``\``
- :param escapeQuotes: A flag indicating whether values containing quotes should always
+ :param escapeQuotes: a flag indicating whether values containing quotes should always
be enclosed in quotes. If None is set, it uses the default value
``true``, escaping all values containing a quote character.
- :param quoteAll: A flag indicating whether all values should always be enclosed in
+ :param quoteAll: a flag indicating whether all values should always be enclosed in
quotes. If None is set, it uses the default value ``false``,
only escaping values containing a quote character.
:param header: writes the names of columns as the first line. If None is set, it uses
@@ -746,13 +746,21 @@ class DataFrameWriter(OptionUtils):
formats follow the formats at ``java.text.SimpleDateFormat``.
This applies to timestamp type. If None is set, it uses the
default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSZZ``.
+ :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from
+ values being written should be skipped. If None is set, it
+ uses the default value, ``true``.
+ :param ignoreTrailingWhiteSpace: a flag indicating whether or not trailing whitespaces from
+ values being written should be skipped. If None is set, it
+ uses the default value, ``true``.
>>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
"""
self.mode(mode)
self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header,
nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll,
- dateFormat=dateFormat, timestampFormat=timestampFormat)
+ dateFormat=dateFormat, timestampFormat=timestampFormat,
+ ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace,
+ ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace)
self._jwrite.csv(path)
@since(1.5)
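The `readwriter.py` change above also threads the two options through `DataFrameWriter.csv` as keyword arguments, so they can be passed directly rather than via `option()`. A minimal sketch (the output path is a placeholder):

```python
# Keyword-argument form enabled by the readwriter.py change above;
# /tmp/untrimmed is a placeholder output directory.
df.write.csv("/tmp/untrimmed",
             ignoreLeadingWhiteSpace=False,
             ignoreTrailingWhiteSpace=False)
```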
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index e227f9ceb5..80f4340cdf 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -597,12 +597,12 @@ class DataStreamReader(OptionUtils):
default value, ``false``.
:param inferSchema: infers the input schema automatically from data. It requires one extra
pass over the data. If None is set, it uses the default value, ``false``.
- :param ignoreLeadingWhiteSpace: defines whether or not leading whitespaces from values
- being read should be skipped. If None is set, it uses
- the default value, ``false``.
- :param ignoreTrailingWhiteSpace: defines whether or not trailing whitespaces from values
- being read should be skipped. If None is set, it uses
- the default value, ``false``.
+ :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from
+ values being read should be skipped. If None is set, it
+ uses the default value, ``false``.
+ :param ignoreTrailingWhiteSpace: a flag indicating whether or not trailing whitespaces from
+ values being read should be skipped. If None is set, it
+ uses the default value, ``false``.
:param nullValue: sets the string representation of a null value. If None is set, it uses
the default value, empty string. Since 2.0.1, this ``nullValue`` param
applies to all supported types including the string type.
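The `streaming.py` hunk only rewords the reader docstrings; the reader options themselves are unchanged and default to `false`. For reference, a minimal streaming-read sketch (the schema and input directory are assumptions; streaming CSV sources need an explicit schema):

```python
from pyspark.sql.types import StructType, StructField, StringType

# Hypothetical input directory; streaming CSV reads require a schema.
schema = StructType([StructField("value", StringType())])
stream = spark.readStream.csv("/tmp/csv-input", schema=schema,
                              ignoreLeadingWhiteSpace=False,
                              ignoreTrailingWhiteSpace=False)
```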
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index f0a9a0400e..29d613bc5f 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -450,6 +450,19 @@ class SQLTests(ReusedPySparkTestCase):
Row(_c0=u'Hyukjin', _c1=u'25', _c2=u'I am Hyukjin\n\nI love Spark!')]
self.assertEqual(ages_newlines.collect(), expected)
+ def test_ignorewhitespace_csv(self):
+ tmpPath = tempfile.mkdtemp()
+ shutil.rmtree(tmpPath)
+ self.spark.createDataFrame([[" a", "b ", " c "]]).write.csv(
+ tmpPath,
+ ignoreLeadingWhiteSpace=False,
+ ignoreTrailingWhiteSpace=False)
+
+ expected = [Row(value=u' a,b , c ')]
+ readback = self.spark.read.text(tmpPath)
+ self.assertEqual(readback.collect(), expected)
+ shutil.rmtree(tmpPath)
+
def test_read_multiple_orc_file(self):
df = self.spark.read.orc(["python/test_support/sql/orc_partitioned/b=0/c=0",
"python/test_support/sql/orc_partitioned/b=1/c=1"])