path: root/python/pyspark/sql/functions.py
author    hyukjinkwon <gurwls223@gmail.com>    2016-11-05 21:47:33 -0700
committer Felix Cheung <felixcheung@apache.org>    2016-11-05 21:47:33 -0700
commit    15d392688456ad9f963417843c52a7b610f771d2 (patch)
tree      4a969525408e41c1bc971c9b79ab20ad741a3962 /python/pyspark/sql/functions.py
parent    9a87c313859a6557bbf7bca7239043cb77ea23be (diff)
[MINOR][DOCUMENTATION] Fix some minor descriptions in functions consistently with expressions
## What changes were proposed in this pull request?

This PR proposes to improve the documentation and fix some descriptions, matching several minor fixes identified in https://github.com/apache/spark/pull/15677

Also, it changes `Note:` and `NOTE:` to `.. note::` for consistency with the other docstrings, which also renders more nicely.

## How was this patch tested?

Jenkins tests and manually. For PySpark, changing `Note:` and `NOTE:` to `.. note::` renders the documentation as below:

**From**

![2016-11-04 6 53 35](https://cloud.githubusercontent.com/assets/6477701/20002648/42989922-a2c5-11e6-8a32-b73eda49e8c3.png)
![2016-11-04 6 53 45](https://cloud.githubusercontent.com/assets/6477701/20002650/429fb310-a2c5-11e6-926b-e030d7eb0185.png)
![2016-11-04 6 54 11](https://cloud.githubusercontent.com/assets/6477701/20002649/429d570a-a2c5-11e6-9e7e-44090f337e32.png)
![2016-11-04 6 53 51](https://cloud.githubusercontent.com/assets/6477701/20002647/4297fc74-a2c5-11e6-801a-b89fbcbfca44.png)
![2016-11-04 6 53 51](https://cloud.githubusercontent.com/assets/6477701/20002697/749f5780-a2c5-11e6-835f-022e1f2f82e3.png)

**To**

![2016-11-04 7 03 48](https://cloud.githubusercontent.com/assets/6477701/20002659/4961b504-a2c5-11e6-9ee0-ef0751482f47.png)
![2016-11-04 7 04 03](https://cloud.githubusercontent.com/assets/6477701/20002660/49871d3a-a2c5-11e6-85ea-d9a5d11efeff.png)
![2016-11-04 7 04 28](https://cloud.githubusercontent.com/assets/6477701/20002662/498e0f14-a2c5-11e6-803d-c0c5aeda4153.png)
![2016-11-04 7 33 39](https://cloud.githubusercontent.com/assets/6477701/20002731/a76e30d2-a2c5-11e6-993b-0481b8342d6b.png)

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #15765 from HyukjinKwon/minor-function-doc.
Diffstat (limited to 'python/pyspark/sql/functions.py')
-rw-r--r--    python/pyspark/sql/functions.py    35
1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 245357a4ba..46a092f16d 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -359,8 +359,8 @@ def grouping_id(*cols):
(grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
- Note: the list of columns should match with grouping columns exactly, or empty (means all the
- grouping columns).
+ .. note:: the list of columns should match with grouping columns exactly, or empty (means all
+ the grouping columns).
>>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show()
+-----+-------------+--------+
@@ -457,7 +457,8 @@ def nanvl(col1, col2):
@since(1.4)
def rand(seed=None):
- """Generates a random column with i.i.d. samples from U[0.0, 1.0].
+ """Generates a random column with independent and identically distributed (i.i.d.) samples
+ from U[0.0, 1.0].
"""
sc = SparkContext._active_spark_context
if seed is not None:
@@ -469,7 +470,8 @@ def rand(seed=None):
@since(1.4)
def randn(seed=None):
- """Generates a column with i.i.d. samples from the standard normal distribution.
+ """Generates a column with independent and identically distributed (i.i.d.) samples from
+ the standard normal distribution.
"""
sc = SparkContext._active_spark_context
if seed is not None:
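As context for these two docstrings, a minimal usage sketch (assuming an active SparkSession bound to `spark`; the column aliases are only illustrative):

    from pyspark.sql.functions import rand, randn

    # rand() draws i.i.d. uniform samples, randn() i.i.d. standard normal samples;
    # fixing a seed makes the generated column reproducible across runs.
    df = spark.range(3)
    df.select(rand(seed=42).alias('uniform'), randn(seed=42).alias('normal')).show()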
@@ -518,7 +520,7 @@ def shiftLeft(col, numBits):
@since(1.5)
def shiftRight(col, numBits):
- """Shift the given value numBits right.
+ """(Signed) shift the given value numBits right.
>>> spark.createDataFrame([(42,)], ['a']).select(shiftRight('a', 1).alias('r')).collect()
[Row(r=21)]
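The "(Signed)" wording matters for negative inputs: the shift is arithmetic and keeps the sign bit, unlike `shiftRightUnsigned`. A minimal sketch, assuming an active SparkSession `spark`:

    from pyspark.sql.functions import shiftRight, shiftRightUnsigned

    df = spark.createDataFrame([(-42,)], ['a'])
    # The signed (arithmetic) shift preserves the sign: -42 >> 1 == -21,
    # while the unsigned variant treats the value as a raw 64-bit pattern.
    df.select(shiftRight('a', 1).alias('signed'),
              shiftRightUnsigned('a', 1).alias('unsigned')).show()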
@@ -777,8 +779,8 @@ def date_format(date, format):
A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
pattern letters of the Java class `java.text.SimpleDateFormat` can be used.
- NOTE: Use when ever possible specialized functions like `year`. These benefit from a
- specialized implementation.
+ .. note:: Use whenever possible specialized functions like `year`. These benefit from a
+ specialized implementation.
>>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
>>> df.select(date_format('a', 'MM/dd/yyy').alias('date')).collect()
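To make the note about specialized functions concrete, a hedged comparison sketch (assuming an active SparkSession `spark`; both columns carry the same calendar year, but `year` uses a dedicated expression rather than going through `SimpleDateFormat`):

    from pyspark.sql.functions import date_format, year

    df = spark.createDataFrame([('2015-04-08',)], ['a'])
    # date_format returns the year as a formatted string; year() returns it
    # directly via a specialized implementation.
    df.select(date_format('a', 'yyyy').alias('via_format'),
              year('a').alias('via_year')).show()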
@@ -1059,7 +1061,8 @@ def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'):
@since(1.5)
def from_utc_timestamp(timestamp, tz):
"""
- Assumes given timestamp is UTC and converts to given timezone.
+ Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
+ that corresponds to the same time of day in the given timezone.
>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(from_utc_timestamp(df.t, "PST").alias('t')).collect()
@@ -1072,7 +1075,8 @@ def from_utc_timestamp(timestamp, tz):
@since(1.5)
def to_utc_timestamp(timestamp, tz):
"""
- Assumes given timestamp is in given timezone and converts to UTC.
+ Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
+ another timestamp that corresponds to the same time of day in UTC.
>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(to_utc_timestamp(df.t, "PST").alias('t')).collect()
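A short round-trip sketch of the two reworded conversions (assuming an active SparkSession `spark`; 'PST' is only an example timezone string):

    from pyspark.sql.functions import from_utc_timestamp, to_utc_timestamp

    df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    # Read t as a UTC wall-clock time, express it in PST, then convert back;
    # the round trip should return the original timestamp.
    local = from_utc_timestamp(df.t, 'PST')
    df.select(to_utc_timestamp(local, 'PST').alias('round_trip')).show()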
@@ -1314,8 +1318,8 @@ def instr(str, substr):
Locate the position of the first occurrence of substr column in the given string.
Returns null if either of the arguments are null.
- NOTE: The position is not zero based, but 1 based index, returns 0 if substr
- could not be found in str.
+ .. note:: The position is not zero based, but 1 based index. Returns 0 if substr
+ could not be found in str.
>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(instr(df.s, 'b').alias('s')).collect()
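To illustrate the 1-based-index note, a small sketch (assuming an active SparkSession `spark`): 'b' is found at position 2, while a missing substring yields 0 rather than null:

    from pyspark.sql.functions import instr

    df = spark.createDataFrame([('abcd',)], ['s'])
    # Positions start at 1; 0 means the substring was not found.
    df.select(instr(df.s, 'b').alias('found'),
              instr(df.s, 'z').alias('missing')).show()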
@@ -1379,8 +1383,8 @@ def locate(substr, str, pos=1):
"""
Locate the position of the first occurrence of substr in a string column, after position pos.
- NOTE: The position is not zero based, but 1 based index. returns 0 if substr
- could not be found in str.
+ .. note:: The position is not zero based, but 1 based index. Returns 0 if substr
+ could not be found in str.
:param substr: a string
:param str: a Column of :class:`pyspark.sql.types.StringType`
@@ -1442,7 +1446,7 @@ def split(str, pattern):
"""
Splits str around pattern (pattern is a regular expression).
- NOTE: pattern is a string represent the regular expression.
+ .. note:: pattern is a string representing the regular expression.
>>> df = spark.createDataFrame([('ab12cd',)], ['s',])
>>> df.select(split(df.s, '[0-9]+').alias('s')).collect()
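Because `pattern` is a regular expression rather than a literal string, characters such as '.' need escaping; a minimal sketch assuming an active SparkSession `spark`:

    from pyspark.sql.functions import split

    df = spark.createDataFrame([('a.b.c',)], ['s'])
    # A bare '.' would match any character, so escape it to split on a literal dot.
    df.select(split(df.s, r'\.').alias('parts')).collect()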
@@ -1785,7 +1789,8 @@ def size(col):
@since(1.5)
def sort_array(col, asc=True):
"""
- Collection function: sorts the input array for the given column in ascending order.
+ Collection function: sorts the input array in ascending or descending order according
+ to the natural ordering of the array elements.
:param col: name of column or expression
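A usage sketch of the reworded behavior, covering both sort directions (assuming an active SparkSession `spark`):

    from pyspark.sql.functions import sort_array

    df = spark.createDataFrame([([2, 1, 3],)], ['data'])
    # Elements are compared by their natural ordering; asc=False reverses it.
    df.select(sort_array(df.data).alias('ascending'),
              sort_array(df.data, asc=False).alias('descending')).collect()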