diff options
author | Yanbo Liang <ybliang8@gmail.com> | 2016-06-01 10:49:51 -0700 |
---|---|---|
committer | Nick Pentreath <nickp@za.ibm.com> | 2016-06-01 10:49:51 -0700 |
commit | 07a98ca4ce4e715ce32b4be75010e28764da459b (patch) | |
tree | 29f45b7515182db24b62b505d0efb8dbb76f708c /python/pyspark | |
parent | a71d1364ae87aa388128da34dd0b9b02ff85e458 (diff) | |
download | spark-07a98ca4ce4e715ce32b4be75010e28764da459b.tar.gz spark-07a98ca4ce4e715ce32b4be75010e28764da459b.tar.bz2 spark-07a98ca4ce4e715ce32b4be75010e28764da459b.zip |
[SPARK-15587][ML] ML 2.0 QA: Scala APIs audit for ml.feature
## What changes were proposed in this pull request?
ML 2.0 QA: Scala APIs audit for ml.feature. Mainly include:
* Remove seed for ```QuantileDiscretizer```, since we use ```approxQuantile``` to produce bins and ```seed``` is useless.
* Scala API docs update.
* Sync Scala and Python API docs for these changes.
## How was this patch tested?
Existing tests.
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #13410 from yanboliang/spark-15587.
Diffstat (limited to 'python/pyspark')
-rwxr-xr-x | python/pyspark/ml/feature.py | 29 |
1 file changed, 13 insertions, 16 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index eb555cb940..1aff2e550f 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -19,8 +19,6 @@ import sys if sys.version > '3': basestring = str -from py4j.java_collections import JavaArray - from pyspark import since, keyword_only from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.linalg import _convert_to_vector @@ -159,9 +157,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav "Split points for mapping continuous features into buckets. With n+1 splits, " + "there are n buckets. A bucket defined by splits x,y holds values in the " + "range [x,y) except the last bucket, which also includes y. The splits " + - "should be strictly increasing. Values at -inf, inf must be explicitly " + - "provided to cover all Double values; otherwise, values outside the splits " + - "specified will be treated as errors.", + "should be of length >= 3 and strictly increasing. Values at -inf, inf must be " + + "explicitly provided to cover all Double values; otherwise, values outside the " + + "splits specified will be treated as errors.", typeConverter=TypeConverters.toListFloat) @keyword_only @@ -1171,8 +1169,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead @inherit_doc -class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable, - JavaMLWritable): +class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental @@ -1186,9 +1183,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav >>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"]) >>> qds = QuantileDiscretizer(numBuckets=2, - ... inputCol="values", outputCol="buckets", seed=123, relativeError=0.01) - >>> qds.getSeed() - 123 + ... 
inputCol="values", outputCol="buckets", relativeError=0.01) >>> qds.getRelativeError() 0.01 >>> bucketizer = qds.fit(df) @@ -1220,9 +1215,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav typeConverter=TypeConverters.toFloat) @keyword_only - def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001): + def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001): """ - __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001) + __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001) """ super(QuantileDiscretizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer", @@ -1233,11 +1228,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav @keyword_only @since("2.0.0") - def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, - relativeError=0.001): + def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001): """ - setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \ - relativeError=0.001) + setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001) Set the params for the QuantileDiscretizer """ kwargs = self.setParams._input_kwargs @@ -1481,6 +1474,10 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set. + The "unit std" is computed using the `corrected sample standard deviation \ + <https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation>`_, + which is computed as the square root of the unbiased sample variance. 
+ >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) >>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled") |