author | Nicholas Chammas <nicholas.chammas@gmail.com> | 2016-07-28 14:57:15 -0700
---|---|---
committer | Reynold Xin <rxin@databricks.com> | 2016-07-28 14:57:15 -0700
commit | 274f3b9ec86e4109c7678eef60f990d41dc3899f |
tree | 2394c6f1ff3e51bd9ea6bd2b365c1e7068c61295 /python/pyspark/sql/session.py |
parent | 3fd39b87bda77f3c3a4622d854f23d4234683571 |
[SPARK-16772] Correct API doc references to PySpark classes + formatting fixes
## What's Been Changed
This PR corrects several broken or missing class references in the Python API docs. It also corrects formatting problems.
For example, you can see [here](http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html#pyspark.sql.SQLContext.registerFunction) how Sphinx fails to pick up the reference to `DataType`. That's because the reference is written relative to the current module, whereas `DataType` lives in a different module (`pyspark.sql.types`), so it needs to be fully qualified.
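A simplified sketch of the two docstring styles (illustrative only, not the verbatim Spark source) makes the difference concrete:

```python
# Simplified sketch (not the verbatim Spark source). Sphinx resolves a bare
# :class:`DataType` relative to the module being documented (here pyspark.sql),
# but DataType actually lives in pyspark.sql.types, so the cross-reference
# silently fails to become a link.

def registerFunction(name, f, returnType=None):
    """Registers a Python function as a UDF.

    :param returnType: a :class:`DataType` object (broken: resolved against
        the current module, where no ``DataType`` exists)
    """

def registerFunction_fixed(name, f, returnType=None):
    """Registers a Python function as a UDF.

    :param returnType: a :class:`pyspark.sql.types.DataType` object (fully
        qualified, so Sphinx can always build the link)
    """
```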
You can also see [here](http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html#pyspark.sql.SQLContext.createDataFrame) how `byte`, `tinyint`, and so on are rendered in italics instead of monospace. That's because in reStructuredText, unlike in Markdown, single backticks invoke the default (title-reference) role, which renders as italics; inline literals need double backticks.
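A minimal docstring sketch of the backtick difference (illustrative only, not the exact Spark docstring):

```python
# Illustrative docstring only: how ReST renders single vs. double backticks.
def schema_string_example():
    """Accepts a data type string as the schema.

    `byte` instead of `tinyint`      -- single backticks: ReST's default
                                        (title-reference) role, rendered in italics
    ``byte`` instead of ``tinyint``  -- double backticks: inline literal,
                                        rendered as monospace (what Markdown's
                                        single backticks do)
    """
```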
## Testing
I tested this PR by [building the Python docs](https://github.com/apache/spark/tree/master/docs#generating-the-documentation-html) and reviewing the results locally in my browser. I confirmed that the broken or missing class references were resolved, and that the formatting was corrected.
Author: Nicholas Chammas <nicholas.chammas@gmail.com>
Closes #14393 from nchammas/python-docstring-fixes.
Diffstat (limited to 'python/pyspark/sql/session.py')
-rw-r--r-- | python/pyspark/sql/session.py | 41 |
1 file changed, 23 insertions(+), 18 deletions(-)
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
index 594f9375f7..10bd89b03f 100644
--- a/python/pyspark/sql/session.py
+++ b/python/pyspark/sql/session.py
@@ -47,7 +47,7 @@ def _monkey_patch_RDD(sparkSession):
 
         This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``
 
-        :param schema: a StructType or list of names of columns
+        :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
         :param samplingRatio: the sample ratio of rows used for inferring
         :return: a DataFrame
 
@@ -274,9 +274,9 @@ class SparkSession(object):
     @since(2.0)
     def range(self, start, end=None, step=1, numPartitions=None):
         """
-        Create a :class:`DataFrame` with single LongType column named `id`,
-        containing elements in a range from `start` to `end` (exclusive) with
-        step value `step`.
+        Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named
+        ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with
+        step value ``step``.
 
         :param start: the start value
         :param end: the end value (exclusive)
@@ -307,7 +307,7 @@ class SparkSession(object):
         Infer schema from list of Row or tuple.
 
         :param data: list of Row or tuple
-        :return: StructType
+        :return: :class:`pyspark.sql.types.StructType`
         """
         if not data:
             raise ValueError("can not infer schema from empty dataset")
@@ -326,7 +326,7 @@ class SparkSession(object):
 
         :param rdd: an RDD of Row or tuple
         :param samplingRatio: sampling ratio, or no sampling (default)
-        :return: StructType
+        :return: :class:`pyspark.sql.types.StructType`
         """
         first = rdd.first()
         if not first:
@@ -414,28 +414,33 @@ class SparkSession(object):
         from ``data``, which should be an RDD of :class:`Row`,
         or :class:`namedtuple`, or :class:`dict`.
 
-        When ``schema`` is :class:`DataType` or datatype string, it must match the real data, or
-        exception will be thrown at runtime. If the given schema is not StructType, it will be
-        wrapped into a StructType as its only field, and the field name will be "value", each record
-        will also be wrapped into a tuple, which can be converted to row later.
+        When ``schema`` is :class:`pyspark.sql.types.DataType` or
+        :class:`pyspark.sql.types.StringType`, it must match the
+        real data, or an exception will be thrown at runtime. If the given schema is not
+        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
+        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value",
+        each record will also be wrapped into a tuple, which can be converted to row later.
 
         If schema inference is needed, ``samplingRatio`` is used to determined the ratio of
         rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.
 
         :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean,
             etc.), or :class:`list`, or :class:`pandas.DataFrame`.
-        :param schema: a :class:`DataType` or a datatype string or a list of column names, default
-            is None. The data type string format equals to `DataType.simpleString`, except that
-            top level struct type can omit the `struct<>` and atomic types use `typeName()` as
-            their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int`
-            as a short name for IntegerType.
+        :param schema: a :class:`pyspark.sql.types.DataType` or a
+            :class:`pyspark.sql.types.StringType` or a list of
+            column names, default is ``None``. The data type string format equals to
+            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
+            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
+            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
+            ``int`` as a short name for ``IntegerType``.
         :param samplingRatio: the sample ratio of rows used for inferring
         :return: :class:`DataFrame`
 
         .. versionchanged:: 2.0
-           The schema parameter can be a DataType or a datatype string after 2.0. If it's not a
-           StructType, it will be wrapped into a StructType and each record will also be wrapped
-           into a tuple.
+           The ``schema`` parameter can be a :class:`pyspark.sql.types.DataType` or a
+           :class:`pyspark.sql.types.StringType` after 2.0. If it's not a
+           :class:`pyspark.sql.types.StructType`, it will be wrapped into a
+           :class:`pyspark.sql.types.StructType` and each record will also be wrapped into a tuple.
 
         >>> l = [('Alice', 1)]
         >>> spark.createDataFrame(l).collect()
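For reference, a small PySpark sketch of the behavior the updated ``createDataFrame`` docstring describes; it is illustrative only (not part of this patch) and assumes a local PySpark 2.x installation:

```python
# Sketch of the schema forms documented above: a data type string with short
# type names, and a non-struct data type that gets wrapped into a single
# "value" field. Assumes PySpark 2.x is installed locally.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("schema-doc-example").getOrCreate()

# Schema as a data type string: the top-level struct<> is omitted and atomic
# types use their short names (e.g. ``int`` for IntegerType).
df = spark.createDataFrame([('Alice', 1)], "name: string, age: int")
# df.collect() -> [Row(name='Alice', age=1)]

# Schema as a non-struct data type: each record is wrapped into a tuple and
# the single resulting field is named "value".
df2 = spark.createDataFrame(['Alice', 'Bob'], "string")
# df2.collect() -> [Row(value='Alice'), Row(value='Bob')]

spark.stop()
```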