-rw-r--r--   R/pkg/R/functions.R                                             22
-rw-r--r--   python/pyspark/sql/functions.py                                 35
-rw-r--r--   sql/core/src/main/scala/org/apache/spark/sql/functions.scala    30
3 files changed, 51 insertions, 36 deletions
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 9a545f0647..f8a9d3ce5d 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2317,7 +2317,8 @@ setMethod("date_format", signature(y = "Column", x = "character"),
#' from_utc_timestamp
#'
-#' Assumes given timestamp is UTC and converts to given timezone.
+#' Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
+#' that corresponds to the same time of day in the given timezone.
#'
#' @param y Column to compute on.
#' @param x time zone to use.
@@ -2340,7 +2341,7 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
#' Locate the position of the first occurrence of substr column in the given string.
#' Returns null if either of the arguments are null.
#'
-#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
#' could not be found in str.
#'
#' @param y column to check
@@ -2391,7 +2392,8 @@ setMethod("next_day", signature(y = "Column", x = "character"),
#' to_utc_timestamp
#'
-#' Assumes given timestamp is in given timezone and converts to UTC.
+#' Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
+#' another timestamp that corresponds to the same time of day in UTC.
#'
#' @param y Column to compute on
#' @param x timezone to use
@@ -2539,7 +2541,7 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
#' shiftRight
#'
-#' Shift the given value numBits right. If the given value is a long value, it will return
+#' (Signed) shift the given value numBits right. If the given value is a long value, it will return
#' a long value else it will return an integer value.
#'
#' @param y column to compute on.
@@ -2777,7 +2779,7 @@ setMethod("window", signature(x = "Column"),
#' locate
#'
#' Locate the position of the first occurrence of substr.
-#' NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+#' NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
#' could not be found in str.
#'
#' @param substr a character string to be matched.
@@ -2823,7 +2825,8 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),
#' rand
#'
-#' Generate a random column with i.i.d. samples from U[0.0, 1.0].
+#' Generate a random column with independent and identically distributed (i.i.d.) samples
+#' from U[0.0, 1.0].
#'
#' @param seed a random seed. Can be missing.
#' @family normal_funcs
@@ -2852,7 +2855,8 @@ setMethod("rand", signature(seed = "numeric"),
#' randn
#'
-#' Generate a column with i.i.d. samples from the standard normal distribution.
+#' Generate a column with independent and identically distributed (i.i.d.) samples from
+#' the standard normal distribution.
#'
#' @param seed a random seed. Can be missing.
#' @family normal_funcs
@@ -3442,8 +3446,8 @@ setMethod("size",
#' sort_array
#'
-#' Sorts the input array for the given column in ascending order,
-#' according to the natural ordering of the array elements.
+#' Sorts the input array in ascending or descending order according
+#' to the natural ordering of the array elements.
#'
#' @param x A Column to sort
#' @param asc A logical flag indicating the sorting order.
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 245357a4ba..46a092f16d 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -359,8 +359,8 @@ def grouping_id(*cols):
(grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
- Note: the list of columns should match with grouping columns exactly, or empty (means all the
- grouping columns).
+ .. note:: the list of columns should match with grouping columns exactly, or empty (means all
+ the grouping columns).
>>> df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show()
+-----+-------------+--------+
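To make the bit layout above concrete, here is a minimal Scala sketch (the DataFrame `df` with columns `name`, `city`, and `age` is illustrative, not from this patch): with two grouping columns, a cube row aggregated across `city` but not `name` has grouping(name) = 0 and grouping(city) = 1.

    // spark-shell sketch; `df` with columns name/city/age is assumed.
    import org.apache.spark.sql.functions._
    // grouping_id() = (grouping(name) << 1) + grouping(city),
    // so the row aggregated across city only gets (0 << 1) + 1 = 1.
    df.cube("name", "city").agg(grouping_id(), sum("age")).show()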
@@ -457,7 +457,8 @@ def nanvl(col1, col2):
@since(1.4)
def rand(seed=None):
- """Generates a random column with i.i.d. samples from U[0.0, 1.0].
+ """Generates a random column with independent and identically distributed (i.i.d.) samples
+ from U[0.0, 1.0].
"""
sc = SparkContext._active_spark_context
if seed is not None:
@@ -469,7 +470,8 @@ def rand(seed=None):
@since(1.4)
def randn(seed=None):
- """Generates a column with i.i.d. samples from the standard normal distribution.
+ """Generates a column with independent and identically distributed (i.i.d.) samples from
+ the standard normal distribution.
"""
sc = SparkContext._active_spark_context
if seed is not None:
@@ -518,7 +520,7 @@ def shiftLeft(col, numBits):
@since(1.5)
def shiftRight(col, numBits):
- """Shift the given value numBits right.
+ """(Signed) shift the given value numBits right.
>>> spark.createDataFrame([(42,)], ['a']).select(shiftRight('a', 1).alias('r')).collect()
[Row(r=21)]
@@ -777,8 +779,8 @@ def date_format(date, format):
A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
pattern letters of the Java class `java.text.SimpleDateFormat` can be used.
- NOTE: Use when ever possible specialized functions like `year`. These benefit from a
- specialized implementation.
+ .. note:: Whenever possible, use specialized functions like `year`. These benefit from a
+ specialized implementation.
>>> df = spark.createDataFrame([('2015-04-08',)], ['a'])
>>> df.select(date_format('a', 'MM/dd/yyy').alias('date')).collect()
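As a concrete illustration of the note above, a minimal Scala sketch (the column `d` holding dates is assumed): both expressions yield the year, but `year` uses the specialized implementation.

    import org.apache.spark.sql.functions._
    df.select(year($"d"))                 // preferred: specialized implementation
    df.select(date_format($"d", "yyyy"))  // same year, rendered as a string by the generic formatter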
@@ -1059,7 +1061,8 @@ def unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'):
@since(1.5)
def from_utc_timestamp(timestamp, tz):
"""
- Assumes given timestamp is UTC and converts to given timezone.
+ Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
+ that corresponds to the same time of day in the given timezone.
>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(from_utc_timestamp(df.t, "PST").alias('t')).collect()
@@ -1072,7 +1075,8 @@ def from_utc_timestamp(timestamp, tz):
@since(1.5)
def to_utc_timestamp(timestamp, tz):
"""
- Assumes given timestamp is in given timezone and converts to UTC.
+ Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
+ another timestamp that corresponds to the same time of day in UTC.
>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(to_utc_timestamp(df.t, "PST").alias('t')).collect()
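A minimal spark-shell sketch of the two conversions described above (mirroring the doctests; PST is UTC-8); the two functions are inverses of each other:

    import org.apache.spark.sql.functions._
    val df = Seq("1997-02-28 10:30:00").toDF("t")  // spark-shell: implicits in scope
    // 10:30 as UTC wall-clock time is 02:30 in PST; 10:30 as PST is 18:30 in UTC.
    df.select(from_utc_timestamp($"t", "PST")).show()  // 1997-02-28 02:30:00
    df.select(to_utc_timestamp($"t", "PST")).show()    // 1997-02-28 18:30:00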
@@ -1314,8 +1318,8 @@ def instr(str, substr):
Locate the position of the first occurrence of substr column in the given string.
Returns null if either of the arguments are null.
- NOTE: The position is not zero based, but 1 based index, returns 0 if substr
- could not be found in str.
+ .. note:: The position is not zero based, but 1 based index. Returns 0 if substr
+ could not be found in str.
>>> df = spark.createDataFrame([('abcd',)], ['s',])
>>> df.select(instr(df.s, 'b').alias('s')).collect()
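A minimal Scala sketch of the 1-based indexing, mirroring the doctest above:

    import org.apache.spark.sql.functions._
    val df = Seq("abcd").toDF("s")      // spark-shell: implicits in scope
    df.select(instr($"s", "b")).show()  // 2: 1-based position
    df.select(instr($"s", "x")).show()  // 0: substr not found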
@@ -1379,8 +1383,8 @@ def locate(substr, str, pos=1):
"""
Locate the position of the first occurrence of substr in a string column, after position pos.
- NOTE: The position is not zero based, but 1 based index. returns 0 if substr
- could not be found in str.
+ .. note:: The position is not zero based, but 1 based index. Returns 0 if substr
+ could not be found in str.
:param substr: a string
:param str: a Column of :class:`pyspark.sql.types.StringType`
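A minimal Scala sketch of the `pos` argument (also 1-based; the search starts at position `pos`):

    import org.apache.spark.sql.functions._
    val df = Seq("abcabc").toDF("s")         // spark-shell: implicits in scope
    df.select(locate("b", $"s")).show()      // 2: first match
    df.select(locate("b", $"s", 3)).show()   // 5: search starts at position 3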
@@ -1442,7 +1446,7 @@ def split(str, pattern):
"""
Splits str around pattern (pattern is a regular expression).
- NOTE: pattern is a string represent the regular expression.
+ .. note:: pattern is a string representing the regular expression.
>>> df = spark.createDataFrame([('ab12cd',)], ['s',])
>>> df.select(split(df.s, '[0-9]+').alias('s')).collect()
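Because the pattern is a regular expression, metacharacters must be escaped; a minimal Scala sketch:

    import org.apache.spark.sql.functions._
    val df = Seq("a.b.c").toDF("s")      // spark-shell: implicits in scope
    df.select(split($"s", "\\.")).show() // [a, b, c]: the literal dot is escaped
    df.select(split($"s", ".")).show()   // regex '.' matches every char, leaving empty tokens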
@@ -1785,7 +1789,8 @@ def size(col):
@since(1.5)
def sort_array(col, asc=True):
"""
- Collection function: sorts the input array for the given column in ascending order.
+ Collection function: sorts the input array in ascending or descending order according
+ to the natural ordering of the array elements.
:param col: name of column or expression
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 944a476114..e221c032b8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1117,7 +1117,8 @@ object functions {
def not(e: Column): Column = !e
/**
- * Generate a random column with i.i.d. samples from U[0.0, 1.0].
+ * Generate a random column with independent and identically distributed (i.i.d.) samples
+ * from U[0.0, 1.0].
*
* Note that this is indeterministic when data partitions are not fixed.
*
@@ -1127,7 +1128,8 @@ object functions {
def rand(seed: Long): Column = withExpr { Rand(seed) }
/**
- * Generate a random column with i.i.d. samples from U[0.0, 1.0].
+ * Generate a random column with independent and identically distributed (i.i.d.) samples
+ * from U[0.0, 1.0].
*
* @group normal_funcs
* @since 1.4.0
@@ -1135,7 +1137,8 @@ object functions {
def rand(): Column = rand(Utils.random.nextLong)
/**
- * Generate a column with i.i.d. samples from the standard normal distribution.
+ * Generate a column with independent and identically distributed (i.i.d.) samples from
+ * the standard normal distribution.
*
* Note that this is indeterministic when data partitions are not fixed.
*
@@ -1145,7 +1148,8 @@ object functions {
def randn(seed: Long): Column = withExpr { Randn(seed) }
/**
- * Generate a column with i.i.d. samples from the standard normal distribution.
+ * Generate a column with independent and identically distributed (i.i.d.) samples from
+ * the standard normal distribution.
*
* @group normal_funcs
* @since 1.4.0
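A minimal sketch of the seeded and unseeded forms documented above (per the notes, results are only repeatable when the data partitioning is fixed; `df` is illustrative):

    import org.apache.spark.sql.functions._
    df.select(rand(42), randn(42))  // seeded: repeatable for a fixed partitioning
    df.select(rand(), randn())      // unseeded: a fresh seed per invocation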
@@ -1153,7 +1157,7 @@ object functions {
def randn(): Column = randn(Utils.random.nextLong)
/**
- * Partition ID of the Spark task.
+ * Partition ID.
*
* Note that this is indeterministic because it depends on data partitioning and task scheduling.
*
@@ -1877,8 +1881,8 @@ object functions {
def shiftLeft(e: Column, numBits: Int): Column = withExpr { ShiftLeft(e.expr, lit(numBits).expr) }
/**
- * Shift the given value numBits right. If the given value is a long value, it will return
- * a long value else it will return an integer value.
+ * (Signed) shift the given value numBits right. If the given value is a long value, it will
+ * return a long value else it will return an integer value.
*
* @group math_funcs
* @since 1.5.0
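A minimal Scala sketch of why "(Signed)" matters, contrasted with shiftRightUnsigned, which zero-fills instead of replicating the sign bit:

    import org.apache.spark.sql.functions._
    val df = Seq(-8).toDF("a")                     // spark-shell: implicits in scope
    df.select(shiftRight($"a", 1)).show()          // -4: the sign bit is replicated
    df.select(shiftRightUnsigned($"a", 1)).show()  // 2147483644: zero-fill shift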
@@ -2203,7 +2207,7 @@ object functions {
* Locate the position of the first occurrence of substr column in the given string.
* Returns null if either of the arguments are null.
*
- * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+ * NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
* could not be found in str.
*
* @group string_funcs
@@ -2238,7 +2242,7 @@ object functions {
/**
* Locate the position of the first occurrence of substr.
- * NOTE: The position is not zero based, but 1 based index, returns 0 if substr
+ * NOTE: The position is not zero based, but 1 based index. Returns 0 if substr
* could not be found in str.
*
* @group string_funcs
@@ -2666,7 +2670,8 @@ object functions {
}
/**
- * Assumes given timestamp is UTC and converts to given timezone.
+ * Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
+ * that corresponds to the same time of day in the given timezone.
* @group datetime_funcs
* @since 1.5.0
*/
@@ -2675,7 +2680,8 @@ object functions {
}
/**
- * Assumes given timestamp is in given timezone and converts to UTC.
+ * Given a timestamp, which corresponds to a certain time of day in the given timezone, returns
+ * another timestamp that corresponds to the same time of day in UTC.
* @group datetime_funcs
* @since 1.5.0
*/
@@ -2996,7 +3002,7 @@ object functions {
def sort_array(e: Column): Column = sort_array(e, asc = true)
/**
- * Sorts the input array for the given column in ascending / descending order,
+ * Sorts the input array for the given column in ascending or descending order,
* according to the natural ordering of the array elements.
*
* @group collection_funcs
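A minimal Scala sketch of both orders via the `asc` flag:

    import org.apache.spark.sql.functions._
    val df = Seq(Seq(3, 1, 2)).toDF("xs")             // spark-shell: implicits in scope
    df.select(sort_array($"xs")).show()               // [1, 2, 3]
    df.select(sort_array($"xs", asc = false)).show()  // [3, 2, 1]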