[SPARK-13479][SQL][PYTHON] Added Python API for approxQuantile

## What changes were proposed in this pull request? * Scala DataFrameStatFunctions: Added version of approxQuantile taking a List instead of an Array, for Python compatibility * Python DataFrame and DataFrameStatFunctions: Added approxQuantile ## How was this patch tested? * unit test in sql/tests.py Documentation was copied from the existing approxQuantile exactly. Author: Joseph K. Bradley <joseph@databricks.com> Closes #11356 from jkbradley/approx-quantile-python.
author: Joseph K. Bradley <joseph@databricks.com> 2016-02-24 23:15:36 -0800
committer: Xiangrui Meng <meng@databricks.com> 2016-02-24 23:15:36 -0800
commit: 13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f (patch)
tree: c24ffc89625ab7b5168edef1e4c0e087bcd49836 /python/pyspark/sql/dataframe.py
parent: 2b042577fb077865c3fce69c9d4eda22fde92673 (diff)
download: spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.tar.gz
spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.tar.bz2
spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.zip
1 files changed, 54 insertions, 0 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 7275e69353..76fbb0c9aa 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1178,6 +1178,55 @@ class DataFrame(object):
         return DataFrame(
             self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx)
 
+    @since(2.0)
+    def approxQuantile(self, col, probabilities, relativeError):
+        """
+        Calculates the approximate quantiles of a numerical column of a DataFrame.
+
+        The result of this algorithm has the following deterministic bound:
+        If the DataFrame has N elements and if we request the quantile at
+        probability `p` up to error `err`, then the algorithm will return
+        a sample `x` from the DataFrame so that the *exact* rank of `x` is
+        close to (p * N). More precisely,
+
+          floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+
+        This method implements a variation of the Greenwald-Khanna
+        algorithm (with some speed optimizations). The algorithm was first
+        presented in "Space-efficient Online Computation of Quantile
+        Summaries" by Greenwald and Khanna
+        (http://dx.doi.org/10.1145/375663.375670).
+
+        :param col: the name of the numerical column (must be a `str`)
+        :param probabilities: a list or tuple of quantile probabilities.
+          Each number must belong to [0, 1].
+          For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
+        :param relativeError:  The relative target precision to achieve
+          (>= 0). If set to zero, the exact quantiles are computed, which
+          could be very expensive. Note that values greater than 1 are
+          accepted but give the same result as 1.
+        :return:  a list of the approximate quantiles at the given probabilities
+        :raises ValueError: if any argument fails the type or range checks
+        """
+        if not isinstance(col, str):  # NOTE(review): rejects `unicode` names on Python 2 -- confirm
+            raise ValueError("col should be a string.")
+
+        if not isinstance(probabilities, (list, tuple)):
+            raise ValueError("probabilities should be a list or tuple")
+        if isinstance(probabilities, tuple):  # normalize so a list is always passed on
+            probabilities = list(probabilities)
+        for p in probabilities:
+            if not isinstance(p, (float, int, long)) or p < 0 or p > 1:
+                raise ValueError("probabilities should be numerical (float, int, long) in [0,1].")
+        probabilities = _to_list(self._sc, probabilities)  # presumably a JVM list; helper not shown
+
+        if not isinstance(relativeError, (float, int, long)) or relativeError < 0:
+            raise ValueError("relativeError should be numerical (float, int, long) >= 0.")
+        relativeError = float(relativeError)  # ints are coerced so a float is always passed on
+
+        jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError)
+        return list(jaq)  # copy the result into a plain Python list
+
     @since(1.4)
     def corr(self, col1, col2, method=None):
         """
@@ -1396,6 +1445,11 @@ class DataFrameStatFunctions(object):
     def __init__(self, df):
         self.df = df
 
+    def approxQuantile(self, col, probabilities, relativeError):
+        return self.df.approxQuantile(col, probabilities, relativeError)  # thin delegate
+
+    approxQuantile.__doc__ = DataFrame.approxQuantile.__doc__  # share the DataFrame docstring
+
     def corr(self, col1, col2, method=None):
         return self.df.corr(col1, col2, method)
author	Joseph K. Bradley <joseph@databricks.com>	2016-02-24 23:15:36 -0800
committer	Xiangrui Meng <meng@databricks.com>	2016-02-24 23:15:36 -0800
commit	13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f (patch)
tree	c24ffc89625ab7b5168edef1e4c0e087bcd49836 /python/pyspark/sql/dataframe.py
parent	2b042577fb077865c3fce69c9d4eda22fde92673 (diff)
download	spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.tar.gz spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.tar.bz2 spark-13ce10e95401b21fa40ca0bb27ebf9a0bfffe70f.zip