author     hyukjinkwon <gurwls223@gmail.com>    2016-06-11 23:20:40 -0700
committer  Reynold Xin <rxin@databricks.com>    2016-06-11 23:20:40 -0700
commit     9e204c62c6800e03759e04ef68268105d4b86bf2 (patch)
tree       cb218a5da7c64b4cbe4ff74e318171d35affb4ad /python
parent     e1f986c7a3fcc3864d53ef99ef7f14fa4d262ac3 (diff)
[SPARK-15840][SQL] Add two missing options in documentation and some option related changes
## What changes were proposed in this pull request?

This PR

1. Adds the documentation for the missing options `inferSchema` and `mergeSchema` for Python and Scala.

2. Fixes `[[DataFrame]]` to ```:class:`DataFrame` ``` so that the class link is rendered

   - from ![2016-06-09 9 31 16](https://cloud.githubusercontent.com/assets/6477701/15929721/8b864734-2e89-11e6-83f6-207527de4ac9.png)
   - to (with class link) ![2016-06-09 9 31 00](https://cloud.githubusercontent.com/assets/6477701/15929717/8a03d728-2e89-11e6-8a3f-08294964db22.png)

   (Please refer to [the latest documentation](https://people.apache.org/~pwendell/spark-nightly/spark-master-docs/latest/api/python/pyspark.sql.html).)

3. Moves the `mergeSchema` option to `ParquetOptions` and removes the unused options `metastoreSchema` and `metastoreTableName`. They are no longer used; they were removed in https://github.com/apache/spark/commit/e720dda42e806229ccfd970055c7b8a93eb447bf and there are no remaining use cases, as the search below shows:

   ```bash
   grep -r -e METASTORE_SCHEMA -e \"metastoreSchema\" -e \"metastoreTableName\" -e METASTORE_TABLE_NAME .
   ```

   ```
   ./sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala:  private[sql] val METASTORE_SCHEMA = "metastoreSchema"
   ./sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala:  private[sql] val METASTORE_TABLE_NAME = "metastoreTableName"
   ./sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala:    ParquetFileFormat.METASTORE_TABLE_NAME -> TableIdentifier(
   ```

   The last match only sets `metastoreTableName`; it never uses the table name.

4. Sets the correct default values (in the documentation) for the `compression` option for ORC (`snappy`, see [OrcOptions.scala#L33-L42](https://github.com/apache/spark/blob/3ded5bc4db2badc9ff49554e73421021d854306b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcOptions.scala#L33-L42)) and Parquet (the value specified in SQLConf, see [ParquetOptions.scala#L38-L47](https://github.com/apache/spark/blob/3ded5bc4db2badc9ff49554e73421021d854306b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala#L38-L47)), and for `columnNameOfCorruptRecord` for JSON (the value specified in SQLConf, see [JsonFileFormat.scala#L53-L55](https://github.com/apache/spark/blob/4538443e276597530a27c6922e48503677b13956/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala#L53-L55) and [JsonFileFormat.scala#L105-L106](https://github.com/apache/spark/blob/4538443e276597530a27c6922e48503677b13956/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala#L105-L106)).

## How was this patch tested?

Existing tests should cover this.

Author: hyukjinkwon <gurwls223@gmail.com>
Author: Hyukjin Kwon <gurwls223@gmail.com>

Closes #13576 from HyukjinKwon/SPARK-15840.
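For context, here is a minimal PySpark sketch (not part of the patch) showing how the options documented above are passed to the reader. The input paths and the `_bad_record` column name are placeholders, and the `SparkSession` setup is assumed:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("reader-options-example").getOrCreate()

# CSV: inferSchema triggers one extra pass over the data to determine column
# types; disable it or pass an explicit schema to avoid that pass.
csv_df = spark.read.csv("examples/people.csv", header=True, inferSchema=True)

# Parquet: mergeSchema merges schemas collected from all part-files and
# overrides spark.sql.parquet.mergeSchema for this read only.
parquet_df = spark.read.option("mergeSchema", "true") \
    .parquet("examples/parquet_partitioned")

# JSON: columnNameOfCorruptRecord renames the field that holds malformed
# records in PERMISSIVE mode; if unset, the value of
# spark.sql.columnNameOfCorruptRecord from SQLConf is used.
json_df = spark.read.json("examples/people.json",
                          columnNameOfCorruptRecord="_bad_record")
```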
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/readwriter.py  40
1 file changed, 27 insertions, 13 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 7d1f18611b..f3182b237e 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -209,7 +209,8 @@ class DataFrameReader(object):
:param columnNameOfCorruptRecord: allows renaming the new field having malformed string
created by ``PERMISSIVE`` mode. This overrides
``spark.sql.columnNameOfCorruptRecord``. If None is set,
- it uses the default value ``_corrupt_record``.
+ it uses the value specified in
+ ``spark.sql.columnNameOfCorruptRecord``.
>>> df1 = spark.read.json('python/test_support/sql/people.json')
>>> df1.dtypes
@@ -276,6 +277,11 @@ class DataFrameReader(object):
def parquet(self, *paths):
"""Loads a Parquet file, returning the result as a :class:`DataFrame`.
+ You can set the following Parquet-specific option(s) for reading Parquet files:
+ * ``mergeSchema``: sets whether we should merge schemas collected from all \
+ Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \
+ The default value is specified in ``spark.sql.parquet.mergeSchema``.
+
>>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned')
>>> df.dtypes
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
@@ -285,7 +291,7 @@ class DataFrameReader(object):
@ignore_unicode_prefix
@since(1.6)
def text(self, paths):
- """Loads a text file and returns a [[DataFrame]] with a single string column named "value".
+ """Loads a text file and returns a :class:`DataFrame` with a single string column named "value".
If the directory structure of the text files contains partitioning information,
those are ignored in the resulting DataFrame. To include partitioning information as
columns, use ``read.format('text').load(...)``.
@@ -304,13 +310,14 @@ class DataFrameReader(object):
@since(2.0)
def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None,
- comment=None, header=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None,
- nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None,
- maxColumns=None, maxCharsPerColumn=None, mode=None):
- """Loads a CSV file and returns the result as a [[DataFrame]].
+ comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None,
+ ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,
+ negativeInf=None, dateFormat=None, maxColumns=None, maxCharsPerColumn=None, mode=None):
+ """Loads a CSV file and returns the result as a :class:`DataFrame`.
- This function goes through the input once to determine the input schema. To avoid going
- through the entire data once, specify the schema explicitly using [[schema]].
+ This function will go through the input once to determine the input schema if
+ ``inferSchema`` is enabled. To avoid going through the entire data once, disable
+ ``inferSchema`` option or specify the schema explicitly using ``schema``.
:param path: string, or list of strings, for input path(s).
:param schema: an optional :class:`StructType` for the input schema.
@@ -328,6 +335,8 @@ class DataFrameReader(object):
character. By default (None), it is disabled.
:param header: uses the first line as names of columns. If None is set, it uses the
default value, ``false``.
+ :param inferSchema: infers the input schema automatically from data. It requires one extra
+ pass over the data. If None is set, it uses the default value, ``false``.
:param ignoreLeadingWhiteSpace: defines whether or not leading whitespaces from values
being read should be skipped. If None is set, it uses
the default value, ``false``.
@@ -378,6 +387,8 @@ class DataFrameReader(object):
self.option("comment", comment)
if header is not None:
self.option("header", header)
+ if inferSchema is not None:
+ self.option("inferSchema", inferSchema)
if ignoreLeadingWhiteSpace is not None:
self.option("ignoreLeadingWhiteSpace", ignoreLeadingWhiteSpace)
if ignoreTrailingWhiteSpace is not None:
@@ -464,7 +475,7 @@ class DataFrameReader(object):
class DataFrameWriter(object):
"""
- Interface used to write a [[DataFrame]] to external storage systems
+ Interface used to write a :class:`DataFrame` to external storage systems
(e.g. file systems, key-value stores, etc). Use :func:`DataFrame.write`
to access this.
@@ -701,7 +712,7 @@ class DataFrameWriter(object):
In the case the table already exists, behavior of this function depends on the
save mode, specified by the `mode` function (default to throwing an exception).
- When `mode` is `Overwrite`, the schema of the [[DataFrame]] does not need to be
+ When `mode` is `Overwrite`, the schema of the :class:`DataFrame` does not need to be
the same as that of the existing table.
* `append`: Append contents of this :class:`DataFrame` to existing data.
@@ -758,7 +769,9 @@ class DataFrameWriter(object):
:param partitionBy: names of partitioning columns
:param compression: compression codec to use when saving to file. This can be one of the
known case-insensitive shorten names (none, snappy, gzip, and lzo).
- This will overwrite ``spark.sql.parquet.compression.codec``.
+ This will override ``spark.sql.parquet.compression.codec``. If None
+ is set, it uses the value specified in
+ ``spark.sql.parquet.compression.codec``.
>>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data'))
"""
@@ -788,7 +801,7 @@ class DataFrameWriter(object):
@since(2.0)
def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None,
header=None, nullValue=None, escapeQuotes=None):
- """Saves the content of the [[DataFrame]] in CSV format at the specified path.
+ """Saves the content of the :class:`DataFrame` in CSV format at the specified path.
:param path: the path in any Hadoop supported file system
:param mode: specifies the behavior of the save operation when data already exists.
@@ -852,7 +865,8 @@ class DataFrameWriter(object):
:param partitionBy: names of partitioning columns
:param compression: compression codec to use when saving to file. This can be one of the
known case-insensitive shorten names (none, snappy, zlib, and lzo).
- This will overwrite ``orc.compress``.
+ This will override ``orc.compress``. If None is set, it uses the
+ default value, ``snappy``.
>>> orc_df = spark.read.orc('python/test_support/sql/orc_partitioned')
>>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data'))
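Similarly, a small usage sketch (not part of the patch) for the writer-side `compression` option described in the hunks above; `df` is assumed to be an existing DataFrame and the output paths are temporary placeholders:

```python
import os
import tempfile

out = tempfile.mkdtemp()

# Parquet: an explicit codec overrides spark.sql.parquet.compression.codec;
# leaving compression unset falls back to the value in SQLConf.
df.write.parquet(os.path.join(out, "parquet_data"), compression="gzip")

# ORC: an explicit codec overrides orc.compress; the documented default is
# snappy. (ORC output requires Hive support in this Spark version.)
df.write.orc(os.path.join(out, "orc_data"), compression="zlib")
```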