From 274f3b9ec86e4109c7678eef60f990d41dc3899f Mon Sep 17 00:00:00 2001
From: Nicholas Chammas
Date: Thu, 28 Jul 2016 14:57:15 -0700
Subject: [SPARK-16772] Correct API doc references to PySpark classes + formatting fixes

## What's Been Changed

The PR corrects several broken or missing class references in the Python API docs. It also corrects formatting problems.

For example, you can see [here](http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html#pyspark.sql.SQLContext.registerFunction) how Sphinx is not picking up the reference to `DataType`. That's because the reference is relative to the current module, whereas `DataType` is in a different module.

You can also see [here](http://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html#pyspark.sql.SQLContext.createDataFrame) how the formatting for `byte`, `tinyint`, and so on is italic instead of monospace. That's because in ReST single backticks just make things italic, unlike in Markdown.

## Testing

I tested this PR by [building the Python docs](https://github.com/apache/spark/tree/master/docs#generating-the-documentation-html) and reviewing the results locally in my browser. I confirmed that the broken or missing class references were resolved and that the formatting was corrected.

Author: Nicholas Chammas

Closes #14393 from nchammas/python-docstring-fixes.
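For reference, here is a minimal sketch of the ReST conventions the fixes below rely on, in a PySpark-style docstring. The function itself is hypothetical and illustrative only, not part of the patch:

```python
def example(col):
    """Sketch of the ReST docstring conventions used in these fixes.

    In ReST, single backticks like `tinyint` render as *italic*, unlike in
    Markdown; double backticks like ``sqrt(a^2 + b^2)`` render as monospace.

    A bare reference like :class:`DataType` only resolves when the class is
    defined in the current module, so cross-module references must be fully
    qualified, e.g. :class:`pyspark.sql.types.DataType`.
    """
    # The body is irrelevant here; only the docstring markup matters.
    return col
```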
---
 python/pyspark/sql/functions.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 92d709ee40..e422363ec1 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -142,7 +142,7 @@ _functions_1_6 = {
 _binary_mathfunctions = {
     'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' +
              'polar coordinates (r, theta).',
-    'hypot': 'Computes `sqrt(a^2 + b^2)` without intermediate overflow or underflow.',
+    'hypot': 'Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.',
     'pow': 'Returns the value of the first argument raised to the power of the second argument.',
 }
 
@@ -958,7 +958,8 @@ def months_between(date1, date2):
 @since(1.5)
 def to_date(col):
     """
-    Converts the column of StringType or TimestampType into DateType.
+    Converts the column of :class:`pyspark.sql.types.StringType` or
+    :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType`.
 
     >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
     >>> df.select(to_date(df.t).alias('date')).collect()
@@ -1074,18 +1075,18 @@ def window(timeColumn, windowDuration, slideDuration=None, startTime=None):
     [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in
     the order of months are not supported.
 
-    The time column must be of TimestampType.
+    The time column must be of :class:`pyspark.sql.types.TimestampType`.
 
     Durations are provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid
     interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'.
-    If the `slideDuration` is not provided, the windows will be tumbling windows.
+    If the ``slideDuration`` is not provided, the windows will be tumbling windows.
 
     The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start
     window intervals. For example, in order to have hourly tumbling windows that start 15 minutes
     past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`.
 
     The output column will be a struct called 'window' by default with the nested columns 'start'
-    and 'end', where 'start' and 'end' will be of `TimestampType`.
+    and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`.
 
     >>> df = spark.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val")
     >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))
@@ -1367,7 +1368,7 @@ def locate(substr, str, pos=1):
     could not be found in str.
 
     :param substr: a string
-    :param str: a Column of StringType
+    :param str: a Column of :class:`pyspark.sql.types.StringType`
     :param pos: start position (zero based)
 
     >>> df = spark.createDataFrame([('abcd',)], ['s',])
@@ -1506,8 +1507,9 @@ def bin(col):
 @ignore_unicode_prefix
 @since(1.5)
 def hex(col):
-    """Computes hex value of the given column, which could be StringType,
-    BinaryType, IntegerType or LongType.
+    """Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`,
+    :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or
+    :class:`pyspark.sql.types.LongType`.
 
     >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect()
     [Row(hex(a)=u'414243', hex(b)=u'3')]
@@ -1781,6 +1783,9 @@ def udf(f, returnType=StringType()):
     duplicate invocations may be eliminated or the function may even be invoked more times than
     it is present in the query.
 
+    :param f: python function
+    :param returnType: a :class:`pyspark.sql.types.DataType` object
+
     >>> from pyspark.sql.types import IntegerType
     >>> slen = udf(lambda s: len(s), IntegerType())
     >>> df.select(slen(df.name).alias('slen')).collect()
--
cgit v1.2.3