aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorNick Pentreath <nickp@za.ibm.com>2016-05-24 10:02:10 +0200
committerNick Pentreath <nickp@za.ibm.com>2016-05-24 10:02:10 +0200
commit6075f5b4d8e98483d26c31576f58e2229024b4f4 (patch)
treeb49308cd5da2fb5ab3ffe80546887016b6a794cd /python
parentd642b273544bb77ef7f584326aa2d214649ac61b (diff)
downloadspark-6075f5b4d8e98483d26c31576f58e2229024b4f4.tar.gz
spark-6075f5b4d8e98483d26c31576f58e2229024b4f4.tar.bz2
spark-6075f5b4d8e98483d26c31576f58e2229024b4f4.zip
[SPARK-15442][ML][PYSPARK] Add 'relativeError' param to PySpark QuantileDiscretizer
This PR adds the `relativeError` param to PySpark's `QuantileDiscretizer` to match Scala. Also cleaned up a duplication of `numBuckets` where the param is both a class and instance attribute (I removed the instance attr to match the style of params throughout `ml`). Finally, cleaned up the docs for `QuantileDiscretizer` to reflect that it now uses `approxQuantile`. ## How was this patch tested? A little doctest and built API docs locally to check HTML doc generation. Author: Nick Pentreath <nickp@za.ibm.com> Closes #13228 from MLnick/SPARK-15442-py-relerror-param.
Diffstat (limited to 'python')
-rwxr-xr-xpython/pyspark/ml/feature.py51
1 file changed, 36 insertions(+), 15 deletions(-)
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 93745c70c4..eb555cb940 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1177,16 +1177,20 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
.. note:: Experimental
`QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
- categorical features. The bin ranges are chosen by taking a sample of the data and dividing it
- into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity,
- covering all real values. This attempts to find numBuckets partitions based on a sample of data,
- but it may find fewer depending on the data sample values.
+ categorical features. The number of bins can be set using the :py:attr:`numBuckets` parameter.
+ The bin ranges are chosen using an approximate algorithm (see the documentation for
+ :py:meth:`~.DataFrameStatFunctions.approxQuantile` for a detailed description).
+ The precision of the approximation can be controlled with the
+ :py:attr:`relativeError` parameter.
+ The lower and upper bin bounds will be `-Infinity` and `+Infinity`, covering all real values.
>>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
>>> qds = QuantileDiscretizer(numBuckets=2,
- ... inputCol="values", outputCol="buckets", seed=123)
+ ... inputCol="values", outputCol="buckets", seed=123, relativeError=0.01)
>>> qds.getSeed()
123
+ >>> qds.getRelativeError()
+ 0.01
>>> bucketizer = qds.fit(df)
>>> splits = bucketizer.getSplits()
>>> splits[0]
@@ -1205,32 +1209,35 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
.. versionadded:: 2.0.0
"""
- # a placeholder to make it appear in the generated doc
numBuckets = Param(Params._dummy(), "numBuckets",
"Maximum number of buckets (quantiles, or " +
- "categories) into which data points are grouped. Must be >= 2. Default 2.",
+ "categories) into which data points are grouped. Must be >= 2.",
typeConverter=TypeConverters.toInt)
+ relativeError = Param(Params._dummy(), "relativeError", "The relative target precision for " +
+ "the approximate quantile algorithm used to generate buckets. " +
+ "Must be in the range [0, 1].",
+ typeConverter=TypeConverters.toFloat)
+
@keyword_only
- def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None):
+ def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001):
"""
- __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None)
+ __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001)
"""
super(QuantileDiscretizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
self.uid)
- self.numBuckets = Param(self, "numBuckets",
- "Maximum number of buckets (quantiles, or " +
- "categories) into which data points are grouped. Must be >= 2.")
- self._setDefault(numBuckets=2)
+ self._setDefault(numBuckets=2, relativeError=0.001)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@keyword_only
@since("2.0.0")
- def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None):
+ def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None,
+ relativeError=0.001):
"""
- setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None)
+ setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \
+ relativeError=0.001)
Set the params for the QuantileDiscretizer
"""
kwargs = self.setParams._input_kwargs
@@ -1250,6 +1257,20 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
"""
return self.getOrDefault(self.numBuckets)
+ @since("2.0.0")
+ def setRelativeError(self, value):
+ """
+ Sets the value of :py:attr:`relativeError`.
+ """
+ return self._set(relativeError=value)
+
+ @since("2.0.0")
+ def getRelativeError(self):
+ """
+ Gets the value of relativeError or its default value.
+ """
+ return self.getOrDefault(self.relativeError)
+
def _create_model(self, java_model):
"""
Private method to convert the java_model to a Python model.