From 274f3b9ec86e4109c7678eef60f990d41dc3899f Mon Sep 17 00:00:00 2001
From: Nicholas Chammas
Date: Thu, 28 Jul 2016 14:57:15 -0700
Subject: [SPARK-16772] Correct API doc references to PySpark classes + formatting fixes

## What's Been Changed

The PR corrects several broken or missing class references in the Python API docs. It also corrects formatting problems.

For example, you can see [here](http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html#pyspark.sql.SQLContext.registerFunction) how Sphinx is not picking up the reference to `DataType`. That's because the reference is relative to the current module, whereas `DataType` is in a different module.

You can also see [here](http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html#pyspark.sql.SQLContext.createDataFrame) how the formatting for byte, tinyint, and so on is italic instead of monospace. That's because in ReST single backticks just make things italic, unlike in Markdown.

## Testing

I tested this PR by [building the Python docs](https://github.com/apache/spark/tree/master/docs#generating-the-documentation-html) and reviewing the results locally in my browser. I confirmed that the broken or missing class references were resolved, and that the formatting was corrected.

Author: Nicholas Chammas

Closes #14393 from nchammas/python-docstring-fixes.
---
 python/pyspark/sql/catalog.py    |  2 +-
 python/pyspark/sql/context.py    | 44 +++++++++++++++++++++++++-------------------
 python/pyspark/sql/dataframe.py  |  2 +-
 python/pyspark/sql/functions.py  | 21 +++++++++++++--------
 python/pyspark/sql/readwriter.py |  8 ++++----
 python/pyspark/sql/session.py    | 41 +++++++++++++++++++++++------------------
 python/pyspark/sql/streaming.py  |  8 ++++----
 python/pyspark/sql/types.py      |  7 ++++---
 8 files changed, 75 insertions(+), 58 deletions(-)

diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 4af930a3cd..3c5030722f 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -193,7 +193,7 @@ class Catalog(object): :param name: name of the UDF :param f: python function - :param returnType: a :class:`DataType` object + :param returnType: a :class:`pyspark.sql.types.DataType` object >>> spark.catalog.registerFunction("stringLengthString", lambda x: len(x)) >>> spark.sql("SELECT stringLengthString('test')").collect() diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 0debcf1de4..f7009fe589 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -152,9 +152,9 @@ class SQLContext(object): @since(1.4) def range(self, start, end=None, step=1, numPartitions=None): """ - Create a :class:`DataFrame` with single LongType column named `id`, - containing elements in a range from `start` to `end` (exclusive) with - step value `step`. + Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named + ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with + step value ``step``.
:param start: the start value :param end: the end value (exclusive) @@ -184,7 +184,7 @@ class SQLContext(object): :param name: name of the UDF :param f: python function - :param returnType: a :class:`DataType` object + :param returnType: a :class:`pyspark.sql.types.DataType` object >>> sqlContext.registerFunction("stringLengthString", lambda x: len(x)) >>> sqlContext.sql("SELECT stringLengthString('test')").collect() @@ -209,7 +209,7 @@ class SQLContext(object): :param rdd: an RDD of Row or tuple :param samplingRatio: sampling ratio, or no sampling (default) - :return: StructType + :return: :class:`pyspark.sql.types.StructType` """ return self.sparkSession._inferSchema(rdd, samplingRatio) @@ -226,28 +226,34 @@ class SQLContext(object): from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. - When ``schema`` is :class:`DataType` or datatype string, it must match the real data, or - exception will be thrown at runtime. If the given schema is not StructType, it will be - wrapped into a StructType as its only field, and the field name will be "value", each record - will also be wrapped into a tuple, which can be converted to row later. + When ``schema`` is :class:`pyspark.sql.types.DataType` or + :class:`pyspark.sql.types.StringType`, it must match the + real data, or an exception will be thrown at runtime. If the given schema is not + :class:`pyspark.sql.types.StructType`, it will be wrapped into a + :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", + each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. - :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, - etc.), or :class:`list`, or :class:`pandas.DataFrame`. - :param schema: a :class:`DataType` or a datatype string or a list of column names, default - is None. The data type string format equals to `DataType.simpleString`, except that - top level struct type can omit the `struct<>` and atomic types use `typeName()` as - their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int` - as a short name for IntegerType. + :param data: an RDD of any kind of SQL data representation(e.g. :class:`Row`, + :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`, or + :class:`pandas.DataFrame`. + :param schema: a :class:`pyspark.sql.types.DataType` or a + :class:`pyspark.sql.types.StringType` or a list of + column names, default is None. The data type string format equals to + :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can + omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use + ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. + We can also use ``int`` as a short name for :class:`pyspark.sql.types.IntegerType`. :param samplingRatio: the sample ratio of rows used for inferring :return: :class:`DataFrame` .. versionchanged:: 2.0 - The schema parameter can be a DataType or a datatype string after 2.0. If it's not a - StructType, it will be wrapped into a StructType and each record will also be wrapped - into a tuple. + The ``schema`` parameter can be a :class:`pyspark.sql.types.DataType` or a + :class:`pyspark.sql.types.StringType` after 2.0. 
+ If it's not a :class:`pyspark.sql.types.StructType`, it will be wrapped into a + :class:`pyspark.sql.types.StructType` and each record will also be wrapped into a tuple. >>> l = [('Alice', 1)] >>> sqlContext.createDataFrame(l).collect() diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 0cbb3adfa8..a986092f5d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -196,7 +196,7 @@ class DataFrame(object): @property @since(1.3) def schema(self): - """Returns the schema of this :class:`DataFrame` as a :class:`types.StructType`. + """Returns the schema of this :class:`DataFrame` as a :class:`pyspark.sql.types.StructType`. >>> df.schema StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true))) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 92d709ee40..e422363ec1 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -142,7 +142,7 @@ _functions_1_6 = { _binary_mathfunctions = { 'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' + 'polar coordinates (r, theta).', - 'hypot': 'Computes `sqrt(a^2 + b^2)` without intermediate overflow or underflow.', + 'hypot': 'Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.', 'pow': 'Returns the value of the first argument raised to the power of the second argument.', } @@ -958,7 +958,8 @@ def months_between(date1, date2): @since(1.5) def to_date(col): """ - Converts the column of StringType or TimestampType into DateType. + Converts the column of :class:`pyspark.sql.types.StringType` or + :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType`. >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t']) >>> df.select(to_date(df.t).alias('date')).collect() @@ -1074,18 +1075,18 @@ def window(timeColumn, windowDuration, slideDuration=None, startTime=None): [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in the order of months are not supported. - The time column must be of TimestampType. + The time column must be of :class:`pyspark.sql.types.TimestampType`. Durations are provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'. - If the `slideDuration` is not provided, the windows will be tumbling windows. + If the ``slideDuration`` is not provided, the windows will be tumbling windows. The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start window intervals. For example, in order to have hourly tumbling windows that start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`. The output column will be a struct called 'window' by default with the nested columns 'start' - and 'end', where 'start' and 'end' will be of `TimestampType`. + and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`. >>> df = spark.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val") >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum")) @@ -1367,7 +1368,7 @@ def locate(substr, str, pos=1): could not be found in str. 
:param substr: a string - :param str: a Column of StringType + :param str: a Column of :class:`pyspark.sql.types.StringType` :param pos: start position (zero based) >>> df = spark.createDataFrame([('abcd',)], ['s',]) @@ -1506,8 +1507,9 @@ def bin(col): @ignore_unicode_prefix @since(1.5) def hex(col): - """Computes hex value of the given column, which could be StringType, - BinaryType, IntegerType or LongType. + """Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`, + :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or + :class:`pyspark.sql.types.LongType`. >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect() [Row(hex(a)=u'414243', hex(b)=u'3')] @@ -1781,6 +1783,9 @@ def udf(f, returnType=StringType()): duplicate invocations may be eliminated or the function may even be invoked more times than it is present in the query. + :param f: python function + :param returnType: a :class:`pyspark.sql.types.DataType` object + >>> from pyspark.sql.types import IntegerType >>> slen = udf(lambda s: len(s), IntegerType()) >>> df.select(slen(df.name).alias('slen')).collect() diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index f7c354f513..4020bb3fa4 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -96,7 +96,7 @@ class DataFrameReader(OptionUtils): By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading. - :param schema: a StructType object + :param schema: a :class:`pyspark.sql.types.StructType` object """ if not isinstance(schema, StructType): raise TypeError("schema should be StructType") @@ -125,7 +125,7 @@ class DataFrameReader(OptionUtils): :param path: optional string or a list of string for file-system backed data sources. :param format: optional string for format of the data source. Default to 'parquet'. - :param schema: optional :class:`StructType` for the input schema. + :param schema: optional :class:`pyspark.sql.types.StructType` for the input schema. :param options: all other string options >>> df = spark.read.load('python/test_support/sql/parquet_partitioned', opt1=True, @@ -166,7 +166,7 @@ class DataFrameReader(OptionUtils): :param path: string represents path to the JSON dataset, or RDD of Strings storing JSON objects. - :param schema: an optional :class:`StructType` for the input schema. + :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema. :param primitivesAsString: infers all primitive values as a string type. If None is set, it uses the default value, ``false``. :param prefersDecimal: infers all floating-point values as a decimal type. If the values @@ -294,7 +294,7 @@ class DataFrameReader(OptionUtils): ``inferSchema`` option or specify the schema explicitly using ``schema``. :param path: string, or list of strings, for input path(s). - :param schema: an optional :class:`StructType` for the input schema. + :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema. :param sep: sets the single character as a separator for each field and value. If None is set, it uses the default value, ``,``. :param encoding: decodes the CSV files by the given encoding type. 
If None is set, diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 594f9375f7..10bd89b03f 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -47,7 +47,7 @@ def _monkey_patch_RDD(sparkSession): This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)`` - :param schema: a StructType or list of names of columns + :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns :param samplingRatio: the sample ratio of rows used for inferring :return: a DataFrame @@ -274,9 +274,9 @@ class SparkSession(object): @since(2.0) def range(self, start, end=None, step=1, numPartitions=None): """ - Create a :class:`DataFrame` with single LongType column named `id`, - containing elements in a range from `start` to `end` (exclusive) with - step value `step`. + Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named + ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with + step value ``step``. :param start: the start value :param end: the end value (exclusive) @@ -307,7 +307,7 @@ class SparkSession(object): Infer schema from list of Row or tuple. :param data: list of Row or tuple - :return: StructType + :return: :class:`pyspark.sql.types.StructType` """ if not data: raise ValueError("can not infer schema from empty dataset") @@ -326,7 +326,7 @@ class SparkSession(object): :param rdd: an RDD of Row or tuple :param samplingRatio: sampling ratio, or no sampling (default) - :return: StructType + :return: :class:`pyspark.sql.types.StructType` """ first = rdd.first() if not first: @@ -414,28 +414,33 @@ class SparkSession(object): from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. - When ``schema`` is :class:`DataType` or datatype string, it must match the real data, or - exception will be thrown at runtime. If the given schema is not StructType, it will be - wrapped into a StructType as its only field, and the field name will be "value", each record - will also be wrapped into a tuple, which can be converted to row later. + When ``schema`` is :class:`pyspark.sql.types.DataType` or + :class:`pyspark.sql.types.StringType`, it must match the + real data, or an exception will be thrown at runtime. If the given schema is not + :class:`pyspark.sql.types.StructType`, it will be wrapped into a + :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", + each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. - :param schema: a :class:`DataType` or a datatype string or a list of column names, default - is None. The data type string format equals to `DataType.simpleString`, except that - top level struct type can omit the `struct<>` and atomic types use `typeName()` as - their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int` - as a short name for IntegerType. + :param schema: a :class:`pyspark.sql.types.DataType` or a + :class:`pyspark.sql.types.StringType` or a list of + column names, default is ``None``. 
The data type string format equals to + :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can + omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use + ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use + ``int`` as a short name for ``IntegerType``. :param samplingRatio: the sample ratio of rows used for inferring :return: :class:`DataFrame` .. versionchanged:: 2.0 - The schema parameter can be a DataType or a datatype string after 2.0. If it's not a - StructType, it will be wrapped into a StructType and each record will also be wrapped - into a tuple. + The ``schema`` parameter can be a :class:`pyspark.sql.types.DataType` or a + :class:`pyspark.sql.types.StringType` after 2.0. If it's not a + :class:`pyspark.sql.types.StructType`, it will be wrapped into a + :class:`pyspark.sql.types.StructType` and each record will also be wrapped into a tuple. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 8bac347e13..a364555003 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -269,7 +269,7 @@ class DataStreamReader(OptionUtils): .. note:: Experimental. - :param schema: a StructType object + :param schema: a :class:`pyspark.sql.types.StructType` object >>> s = spark.readStream.schema(sdf_schema) """ @@ -310,7 +310,7 @@ class DataStreamReader(OptionUtils): :param path: optional string for file-system backed data sources. :param format: optional string for format of the data source. Default to 'parquet'. - :param schema: optional :class:`StructType` for the input schema. + :param schema: optional :class:`pyspark.sql.types.StructType` for the input schema. :param options: all other string options >>> json_sdf = spark.readStream.format("json")\ @@ -349,7 +349,7 @@ class DataStreamReader(OptionUtils): :param path: string represents path to the JSON dataset, or RDD of Strings storing JSON objects. - :param schema: an optional :class:`StructType` for the input schema. + :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema. :param primitivesAsString: infers all primitive values as a string type. If None is set, it uses the default value, ``false``. :param prefersDecimal: infers all floating-point values as a decimal type. If the values @@ -461,7 +461,7 @@ class DataStreamReader(OptionUtils): .. note:: Experimental. :param path: string, or list of strings, for input path(s). - :param schema: an optional :class:`StructType` for the input schema. + :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema. :param sep: sets the single character as a separator for each field and value. If None is set, it uses the default value, ``,``. :param encoding: decodes the CSV files by the given encoding type. If None is set, diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index eea80684e2..1ca4bbc379 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -786,9 +786,10 @@ def _parse_struct_fields_string(s): def _parse_datatype_string(s): """ Parses the given data type string to a :class:`DataType`. The data type string format equals - to `DataType.simpleString`, except that top level struct type can omit the `struct<>` and - atomic types use `typeName()` as their format, e.g. use `byte` instead of `tinyint` for - ByteType. We can also use `int` as a short name for IntegerType. 
+ to :class:`DataType.simpleString`, except that top level struct type can omit + the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead + of ``tinyint`` for :class:`ByteType`. We can also use ``int`` as a short name + for :class:`IntegerType`.

    >>> _parse_datatype_string("int ")
    IntegerType
--
cgit v1.2.3
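
To make the two docstring conventions described in the summary above concrete, here is a minimal, hypothetical sketch. The module and function below are illustrative only and are not part of Spark; only the fully qualified `:class:` references and the double-backtick literals follow the pattern this patch applies.

```python
# Hypothetical helper module -- not Spark code. It only illustrates the two
# Sphinx/ReST docstring conventions applied across pyspark.sql by this patch.


def register_function(name, f, returnType=None):
    """Register a Python function under the given name (illustrative only).

    :param name: name to register the function under
    :param f: python function
    :param returnType: a :class:`pyspark.sql.types.DataType` object

    Cross-references: a bare :class:`DataType` is resolved relative to the
    current module, so outside ``pyspark.sql.types`` Sphinx may not turn it
    into a link; the fully qualified :class:`pyspark.sql.types.DataType`
    resolves from anywhere.

    Literals: ``byte``, ``tinyint`` and ``int`` are wrapped in double
    backticks because a single-backtick span in ReST is the default (italic)
    role, unlike Markdown, where single backticks already mean inline code.
    """
    return name, f, returnType
```

Building the Python API docs as linked in the Testing section and opening the generated `pyspark.sql` page is the quickest way to confirm that the qualified references render as links and the double-backtick spans render in monospace.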
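
The updated `createDataFrame` docstrings also describe the data type string format: a top-level struct omits the `struct<>` wrapper, atomic types use their `typeName()` (`byte` rather than `tinyint`), and `int` is accepted as a short name for `IntegerType`. The rough sketch below shows that format in use; it assumes a working PySpark 2.x installation, and the session setup and sample values are illustrative rather than taken from the patch.

```python
# Rough sketch of the data type strings described in the createDataFrame
# docstrings. Assumes PySpark 2.x is installed; the app name and sample data
# are illustrative placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("schema-string-sketch").getOrCreate()

# Top-level struct written without the struct<> wrapper, using typeName()-style
# atomic types ("int" is the short name for IntegerType).
df = spark.createDataFrame([("Alice", 1)], "name: string, age: int")
df.printSchema()

spark.stop()
```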