diff options
author | Nick Pentreath <nickp@za.ibm.com> | 2016-05-24 10:02:10 +0200 |
---|---|---|
committer | Nick Pentreath <nickp@za.ibm.com> | 2016-05-24 10:02:10 +0200 |
commit | 6075f5b4d8e98483d26c31576f58e2229024b4f4 (patch) | |
tree | b49308cd5da2fb5ab3ffe80546887016b6a794cd /python/pyspark/ml/feature.py | |
parent | d642b273544bb77ef7f584326aa2d214649ac61b (diff) | |
download | spark-6075f5b4d8e98483d26c31576f58e2229024b4f4.tar.gz spark-6075f5b4d8e98483d26c31576f58e2229024b4f4.tar.bz2 spark-6075f5b4d8e98483d26c31576f58e2229024b4f4.zip |
[SPARK-15442][ML][PYSPARK] Add 'relativeError' param to PySpark QuantileDiscretizer
This PR adds the `relativeError` param to PySpark's `QuantileDiscretizer` to match Scala.
Also cleaned up a duplication of `numBuckets` where the param is both a class and instance attribute (I removed the instance attr to match the style of params throughout `ml`).
Finally, cleaned up the docs for `QuantileDiscretizer` to reflect that it now uses `approxQuantile`.
## How was this patch tested?
A little doctest and built API docs locally to check HTML doc generation.
Author: Nick Pentreath <nickp@za.ibm.com>
Closes #13228 from MLnick/SPARK-15442-py-relerror-param.
Diffstat (limited to 'python/pyspark/ml/feature.py')
-rwxr-xr-x | python/pyspark/ml/feature.py | 51 |
1 files changed, 36 insertions, 15 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 93745c70c4..eb555cb940 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1177,16 +1177,20 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav .. note:: Experimental `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned - categorical features. The bin ranges are chosen by taking a sample of the data and dividing it - into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity, - covering all real values. This attempts to find numBuckets partitions based on a sample of data, - but it may find fewer depending on the data sample values. + categorical features. The number of bins can be set using the :py:attr:`numBuckets` parameter. + The bin ranges are chosen using an approximate algorithm (see the documentation for + :py:meth:`~.DataFrameStatFunctions.approxQuantile` for a detailed description). + The precision of the approximation can be controlled with the + :py:attr:`relativeError` parameter. + The lower and upper bin bounds will be `-Infinity` and `+Infinity`, covering all real values. >>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"]) >>> qds = QuantileDiscretizer(numBuckets=2, - ... inputCol="values", outputCol="buckets", seed=123) + ... inputCol="values", outputCol="buckets", seed=123, relativeError=0.01) >>> qds.getSeed() 123 + >>> qds.getRelativeError() + 0.01 >>> bucketizer = qds.fit(df) >>> splits = bucketizer.getSplits() >>> splits[0] @@ -1205,32 +1209,35 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav .. versionadded:: 2.0.0 """ - # a placeholder to make it appear in the generated doc numBuckets = Param(Params._dummy(), "numBuckets", "Maximum number of buckets (quantiles, or " + - "categories) into which data points are grouped. Must be >= 2. Default 2.", + "categories) into which data points are grouped. Must be >= 2.", typeConverter=TypeConverters.toInt) + relativeError = Param(Params._dummy(), "relativeError", "The relative target precision for " + + "the approximate quantile algorithm used to generate buckets. " + + "Must be in the range [0, 1].", + typeConverter=TypeConverters.toFloat) + @keyword_only - def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None): + def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001): """ - __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None) + __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001) """ super(QuantileDiscretizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer", self.uid) - self.numBuckets = Param(self, "numBuckets", - "Maximum number of buckets (quantiles, or " + - "categories) into which data points are grouped. Must be >= 2.") - self._setDefault(numBuckets=2) + self._setDefault(numBuckets=2, relativeError=0.001) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("2.0.0") - def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None): + def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, + relativeError=0.001): """ - setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None) + setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \ + relativeError=0.001) Set the params for the QuantileDiscretizer """ kwargs = self.setParams._input_kwargs @@ -1250,6 +1257,20 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav """ return self.getOrDefault(self.numBuckets) + @since("2.0.0") + def setRelativeError(self, value): + """ + Sets the value of :py:attr:`relativeError`. + """ + return self._set(relativeError=value) + + @since("2.0.0") + def getRelativeError(self): + """ + Gets the value of relativeError or its default value. + """ + return self.getOrDefault(self.relativeError) + def _create_model(self, java_model): """ Private method to convert the java_model to a Python model. |