aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/sql/dataframe.py
diff options
context:
space:
mode:
authorJoseph K. Bradley <joseph@databricks.com>2016-02-24 23:15:36 -0800
committerXiangrui Meng <meng@databricks.com>2016-02-24 23:15:36 -0800
commit13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f (patch)
treec24ffc89625ab7b5168edef1e4c0e087bcd49836 /python/pyspark/sql/dataframe.py
parent2b042577fb077865c3fce69c9d4eda22fde92673 (diff)
downloadspark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.tar.gz
spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.tar.bz2
spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.zip
[SPARK-13479][SQL][PYTHON] Added Python API for approxQuantile
## What changes were proposed in this pull request? * Scala DataFrameStatFunctions: Added version of approxQuantile taking a List instead of an Array, for Python compatbility * Python DataFrame and DataFrameStatFunctions: Added approxQuantile ## How was this patch tested? * unit test in sql/tests.py Documentation was copied from the existing approxQuantile exactly. Author: Joseph K. Bradley <joseph@databricks.com> Closes #11356 from jkbradley/approx-quantile-python.
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r--python/pyspark/sql/dataframe.py54
1 files changed, 54 insertions, 0 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 7275e69353..76fbb0c9aa 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1178,6 +1178,55 @@ class DataFrame(object):
return DataFrame(
self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx)
+ @since(2.0)
+ def approxQuantile(self, col, probabilities, relativeError):
+ """
+ Calculates the approximate quantiles of a numerical column of a
+ DataFrame.
+
+ The result of this algorithm has the following deterministic bound:
+ If the DataFrame has N elements and if we request the quantile at
+ probability `p` up to error `err`, then the algorithm will return
+ a sample `x` from the DataFrame so that the *exact* rank of `x` is
+ close to (p * N). More precisely,
+
+ floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+
+ This method implements a variation of the Greenwald-Khanna
+ algorithm (with some speed optimizations). The algorithm was first
+ present in [[http://dx.doi.org/10.1145/375663.375670
+ Space-efficient Online Computation of Quantile Summaries]]
+ by Greenwald and Khanna.
+
+ :param col: the name of the numerical column
+ :param probabilities: a list of quantile probabilities
+ Each number must belong to [0, 1].
+ For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
+ :param relativeError: The relative target precision to achieve
+ (>= 0). If set to zero, the exact quantiles are computed, which
+ could be very expensive. Note that values greater than 1 are
+ accepted but give the same result as 1.
+ :return: the approximate quantiles at the given probabilities
+ """
+ if not isinstance(col, str):
+ raise ValueError("col should be a string.")
+
+ if not isinstance(probabilities, (list, tuple)):
+ raise ValueError("probabilities should be a list or tuple")
+ if isinstance(probabilities, tuple):
+ probabilities = list(probabilities)
+ for p in probabilities:
+ if not isinstance(p, (float, int, long)) or p < 0 or p > 1:
+ raise ValueError("probabilities should be numerical (float, int, long) in [0,1].")
+ probabilities = _to_list(self._sc, probabilities)
+
+ if not isinstance(relativeError, (float, int, long)) or relativeError < 0:
+ raise ValueError("relativeError should be numerical (float, int, long) >= 0.")
+ relativeError = float(relativeError)
+
+ jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError)
+ return list(jaq)
+
@since(1.4)
def corr(self, col1, col2, method=None):
"""
@@ -1396,6 +1445,11 @@ class DataFrameStatFunctions(object):
def __init__(self, df):
self.df = df
+ def approxQuantile(self, col, probabilities, relativeError):
+ return self.df.approxQuantile(col, probabilities, relativeError)
+
+ approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__
+
def corr(self, col1, col2, method=None):
return self.df.corr(col1, col2, method)