diff options
author | Zheng RuiFeng <ruifengz@foxmail.com> | 2017-02-01 14:11:28 -0800 |
---|---|---|
committer | Holden Karau <holden@us.ibm.com> | 2017-02-01 14:11:28 -0800 |
commit | b0985764f00acea97df7399a6b337262fc97f5ee (patch) | |
tree | 73351f1e8d031d9e6c912cdd7ed7570cd8c674e1 /python/pyspark/sql/dataframe.py | |
parent | c5fcb7f68bff055cc56e487bd48994945e7935cd (diff) | |
download | spark-b0985764f00acea97df7399a6b337262fc97f5ee.tar.gz spark-b0985764f00acea97df7399a6b337262fc97f5ee.tar.bz2 spark-b0985764f00acea97df7399a6b337262fc97f5ee.zip |
[SPARK-14352][SQL] approxQuantile should support multi columns
## What changes were proposed in this pull request?
1. Add multi-column support based on the current private API.
2. Add multi-column support to PySpark.
## How was this patch tested?
unit tests
Author: Zheng RuiFeng <ruifengz@foxmail.com>
Author: Ruifeng Zheng <ruifengz@foxmail.com>
Closes #12135 from zhengruifeng/quantile4multicols.
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 37 |
1 files changed, 30 insertions, 7 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 10e42d0f9d..50373b8585 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -16,7 +16,6 @@ # import sys -import warnings import random if sys.version >= '3': @@ -1348,7 +1347,7 @@ class DataFrame(object): @since(2.0) def approxQuantile(self, col, probabilities, relativeError): """ - Calculates the approximate quantiles of a numerical column of a + Calculates the approximate quantiles of numerical columns of a DataFrame. The result of this algorithm has the following deterministic bound: @@ -1365,7 +1364,10 @@ class DataFrame(object): Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. - :param col: the name of the numerical column + Note that rows containing any null values will be removed before calculation. + + :param col: str, list. + Can be a single column name, or a list of names for multiple columns. :param probabilities: a list of quantile probabilities Each number must belong to [0, 1]. For example 0 is the minimum, 0.5 is the median, 1 is the maximum. @@ -1373,10 +1375,30 @@ class DataFrame(object): (>= 0). If set to zero, the exact quantiles are computed, which could be very expensive. Note that values greater than 1 are accepted but give the same result as 1. - :return: the approximate quantiles at the given probabilities + :return: the approximate quantiles at the given probabilities. If + the input `col` is a string, the output is a list of floats. If the + input `col` is a list or tuple of strings, the output is also a + list, but each element in it is a list of floats, i.e., the output + is a list of list of floats. + + .. versionchanged:: 2.2 + Added support for multiple columns. 
""" - if not isinstance(col, str): - raise ValueError("col should be a string.") + + if not isinstance(col, (str, list, tuple)): + raise ValueError("col should be a string, list or tuple, but got %r" % type(col)) + + isStr = isinstance(col, str) + + if isinstance(col, tuple): + col = list(col) + elif isinstance(col, str): + col = [col] + + for c in col: + if not isinstance(c, str): + raise ValueError("columns should be strings, but got %r" % type(c)) + col = _to_list(self._sc, col) if not isinstance(probabilities, (list, tuple)): raise ValueError("probabilities should be a list or tuple") @@ -1392,7 +1414,8 @@ class DataFrame(object): relativeError = float(relativeError) jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) - return list(jaq) + jaq_list = [list(j) for j in jaq] + return jaq_list[0] if isStr else jaq_list @since(1.4) def corr(self, col1, col2, method=None): |