aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/sql/group.py
diff options
context:
space:
mode:
author Davies Liu <davies@databricks.com> 2015-11-03 13:33:46 -0800
committer Reynold Xin <rxin@databricks.com> 2015-11-03 13:33:46 -0800
commit 1d04dc95c0d3caa485936e65b0493bcc9719f27e (patch)
tree ace98f4fe1c54db7c61d867aade0e4210514f553 /python/pyspark/sql/group.py
parent a9676cc7107c5df6c62a58668c4d95ced1238370 (diff)
download spark-1d04dc95c0d3caa485936e65b0493bcc9719f27e.tar.gz
spark-1d04dc95c0d3caa485936e65b0493bcc9719f27e.tar.bz2
spark-1d04dc95c0d3caa485936e65b0493bcc9719f27e.zip
[SPARK-11467][SQL] add Python API for stddev/variance
Add Python API for stddev/stddev_pop/stddev_samp/variance/var_pop/var_samp/skewness/kurtosis. Author: Davies Liu <davies@databricks.com>. Closes #9424 from davies/py_var.
Diffstat (limited to 'python/pyspark/sql/group.py')
-rw-r--r-- python/pyspark/sql/group.py 88
1 file changed, 88 insertions, 0 deletions
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 71c0bccc5e..946b53e71c 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -167,6 +167,94 @@ class GroupedData(object):
[Row(sum(age)=7, sum(height)=165)]
"""
+ @df_varargs_api
+ @since(1.6)
+ def stddev(self, *cols):
+ """Compute the sample standard deviation of each numeric column for each group.
+
+ :param cols: list of column names (string). Non-numeric columns are ignored.
+
+ >>> df3.groupBy().stddev('age', 'height').collect()
+ [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)]
+ """
+
+ @df_varargs_api
+ @since(1.6)
+ def stddev_samp(self, *cols):
+ """Compute the sample standard deviation of each numeric column for each group.
+
+ :param cols: list of column names (string). Non-numeric columns are ignored.
+
+ >>> df3.groupBy().stddev_samp('age', 'height').collect()
+ [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)]
+ """
+
+ @df_varargs_api
+ @since(1.6)
+ def stddev_pop(self, *cols):
+ """Compute the population standard deviation of each numeric column for each group.
+
+ :param cols: list of column names (string). Non-numeric columns are ignored.
+
+ >>> df3.groupBy().stddev_pop('age', 'height').collect()
+ [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)]
+ """
+
+ @df_varargs_api
+ @since(1.6)
+ def variance(self, *cols):
+ """Compute the population variance of each numeric column for each group.
+
+ :param cols: list of column names (string). Non-numeric columns are ignored.
+
+ >>> df3.groupBy().variance('age', 'height').collect()
+ [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)]
+ """
+
+ @df_varargs_api
+ @since(1.6)
+ def var_pop(self, *cols):
+ """Compute the population variance of each numeric column for each group.
+
+ :param cols: list of column names (string). Non-numeric columns are ignored.
+
+ >>> df3.groupBy().var_pop('age', 'height').collect()
+ [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)]
+ """
+
+ @df_varargs_api
+ @since(1.6)
+ def var_samp(self, *cols):
+ """Compute the sample variance of each numeric column for each group.
+
+ :param cols: list of column names (string). Non-numeric columns are ignored.
+
+ >>> df3.groupBy().var_samp('age', 'height').collect()
+ [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)]
+ """
+
+ @df_varargs_api
+ @since(1.6)
+ def skewness(self, *cols):
+ """Compute the skewness of each numeric column for each group.
+
+ :param cols: list of column names (string). Non-numeric columns are ignored.
+
+ >>> df3.groupBy().skewness('age', 'height').collect()
+ [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)]
+ """
+
+ @df_varargs_api
+ @since(1.6)
+ def kurtosis(self, *cols):
+ """Compute the kurtosis of each numeric column for each group.
+
+ :param cols: list of column names (string). Non-numeric columns are ignored.
+
+ >>> df3.groupBy().kurtosis('age', 'height').collect()
+ [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)]
+ """
+
def _test():
import doctest