diff options
author | Joseph K. Bradley <joseph@databricks.com> | 2016-02-24 23:15:36 -0800 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2016-02-24 23:15:36 -0800 |
commit | 13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f (patch) | |
tree | c24ffc89625ab7b5168edef1e4c0e087bcd49836 /python/pyspark/sql/dataframe.py | |
parent | 2b042577fb077865c3fce69c9d4eda22fde92673 (diff) | |
download | spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.tar.gz spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.tar.bz2 spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.zip |
[SPARK-13479][SQL][PYTHON] Added Python API for approxQuantile
## What changes were proposed in this pull request?
* Scala DataFrameStatFunctions: Added version of approxQuantile taking a List instead of an Array, for Python compatibility
* Python DataFrame and DataFrameStatFunctions: Added approxQuantile
## How was this patch tested?
* unit test in sql/tests.py
Documentation was copied from the existing approxQuantile exactly.
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #11356 from jkbradley/approx-quantile-python.
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 54 |
1 files changed, 54 insertions, 0 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 7275e69353..76fbb0c9aa 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1178,6 +1178,55 @@ class DataFrame(object): return DataFrame( self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx) + @since(2.0) + def approxQuantile(self, col, probabilities, relativeError): + """ + Calculates the approximate quantiles of a numerical column of a + DataFrame. + + The result of this algorithm has the following deterministic bound: + If the DataFrame has N elements and if we request the quantile at + probability `p` up to error `err`, then the algorithm will return + a sample `x` from the DataFrame so that the *exact* rank of `x` is + close to (p * N). More precisely, + + floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). + + This method implements a variation of the Greenwald-Khanna + algorithm (with some speed optimizations). The algorithm was first + present in [[http://dx.doi.org/10.1145/375663.375670 + Space-efficient Online Computation of Quantile Summaries]] + by Greenwald and Khanna. + + :param col: the name of the numerical column + :param probabilities: a list of quantile probabilities + Each number must belong to [0, 1]. + For example 0 is the minimum, 0.5 is the median, 1 is the maximum. + :param relativeError: The relative target precision to achieve + (>= 0). If set to zero, the exact quantiles are computed, which + could be very expensive. Note that values greater than 1 are + accepted but give the same result as 1. 
+ :return: the approximate quantiles at the given probabilities + """ + if not isinstance(col, str): + raise ValueError("col should be a string.") + + if not isinstance(probabilities, (list, tuple)): + raise ValueError("probabilities should be a list or tuple") + if isinstance(probabilities, tuple): + probabilities = list(probabilities) + for p in probabilities: + if not isinstance(p, (float, int, long)) or p < 0 or p > 1: + raise ValueError("probabilities should be numerical (float, int, long) in [0,1].") + probabilities = _to_list(self._sc, probabilities) + + if not isinstance(relativeError, (float, int, long)) or relativeError < 0: + raise ValueError("relativeError should be numerical (float, int, long) >= 0.") + relativeError = float(relativeError) + + jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError) + return list(jaq) + @since(1.4) def corr(self, col1, col2, method=None): """ @@ -1396,6 +1445,11 @@ class DataFrameStatFunctions(object): def __init__(self, df): self.df = df + def approxQuantile(self, col, probabilities, relativeError): + return self.df.approxQuantile(col, probabilities, relativeError) + + approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__ + def corr(self, col1, col2, method=None): return self.df.corr(col1, col2, method) |