diff options
author | Davies Liu <davies@databricks.com> | 2015-11-03 13:33:46 -0800 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-11-03 13:33:46 -0800 |
commit | 1d04dc95c0d3caa485936e65b0493bcc9719f27e (patch) | |
tree | ace98f4fe1c54db7c61d867aade0e4210514f553 /python/pyspark/sql/group.py | |
parent | a9676cc7107c5df6c62a58668c4d95ced1238370 (diff) | |
download | spark-1d04dc95c0d3caa485936e65b0493bcc9719f27e.tar.gz spark-1d04dc95c0d3caa485936e65b0493bcc9719f27e.tar.bz2 spark-1d04dc95c0d3caa485936e65b0493bcc9719f27e.zip |
[SPARK-11467][SQL] add Python API for stddev/variance
Add Python API for stddev/stddev_pop/stddev_samp/variance/var_pop/var_samp/skewness/kurtosis
Author: Davies Liu <davies@databricks.com>
Closes #9424 from davies/py_var.
Diffstat (limited to 'python/pyspark/sql/group.py')
-rw-r--r-- | python/pyspark/sql/group.py | 88 |
1 files changed, 88 insertions, 0 deletions
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 71c0bccc5e..946b53e71c 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -167,6 +167,94 @@ class GroupedData(object): [Row(sum(age)=7, sum(height)=165)] """ + @df_varargs_api + @since(1.6) + def stddev(self, *cols): + """Compute the sample standard deviation for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().stddev('age', 'height').collect() + [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)] + """ + + @df_varargs_api + @since(1.6) + def stddev_samp(self, *cols): + """Compute the sample standard deviation for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().stddev_samp('age', 'height').collect() + [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)] + """ + + @df_varargs_api + @since(1.6) + def stddev_pop(self, *cols): + """Compute the population standard deviation for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().stddev_pop('age', 'height').collect() + [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)] + """ + + @df_varargs_api + @since(1.6) + def variance(self, *cols): + """Compute the sample variance for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().variance('age', 'height').collect() + [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)] + """ + + @df_varargs_api + @since(1.6) + def var_pop(self, *cols): + """Compute the population variance for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. 
+ + >>> df3.groupBy().var_pop('age', 'height').collect() + [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)] + """ + + @df_varargs_api + @since(1.6) + def var_samp(self, *cols): + """Compute the sample variance for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().var_samp('age', 'height').collect() + [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)] + """ + + @df_varargs_api + @since(1.6) + def skewness(self, *cols): + """Compute the skewness for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().skewness('age', 'height').collect() + [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)] + """ + + @df_varargs_api + @since(1.6) + def kurtosis(self, *cols): + """Compute the kurtosis for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().kurtosis('age', 'height').collect() + [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)] + """ + def _test(): import doctest |