author     Yanbo Liang <ybliang8@gmail.com>   2016-06-01 10:49:51 -0700
committer  Nick Pentreath <nickp@za.ibm.com>  2016-06-01 10:49:51 -0700
commit     07a98ca4ce4e715ce32b4be75010e28764da459b (patch)
tree       29f45b7515182db24b62b505d0efb8dbb76f708c /python
parent     a71d1364ae87aa388128da34dd0b9b02ff85e458 (diff)
[SPARK-15587][ML] ML 2.0 QA: Scala APIs audit for ml.feature
## What changes were proposed in this pull request?

ML 2.0 QA: Scala APIs audit for ml.feature. The main changes are:

* Remove `seed` from `QuantileDiscretizer`, since we use `approxQuantile` to produce the bins and the `seed` has no effect.
* Update the Scala API docs.
* Sync the Scala and Python API docs with these changes.

## How was this patch tested?

Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13410 from yanboliang/spark-15587.
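For reference, a minimal sketch of the updated Python API after this change (it assumes an active `SparkSession` named `spark` and mirrors the doctest updated in the diff below):

```python
from pyspark.ml.feature import QuantileDiscretizer

df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
# No seed parameter anymore: the bins come from approxQuantile,
# so a seed had no effect on the result.
qds = QuantileDiscretizer(numBuckets=2, inputCol="values",
                          outputCol="buckets", relativeError=0.01)
bucketizer = qds.fit(df)          # fitting produces a Bucketizer
bucketed = bucketizer.transform(df)
```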
Diffstat (limited to 'python')
-rwxr-xr-x  python/pyspark/ml/feature.py  29
1 file changed, 13 insertions(+), 16 deletions(-)
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index eb555cb940..1aff2e550f 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -19,8 +19,6 @@ import sys
if sys.version > '3':
basestring = str
-from py4j.java_collections import JavaArray
-
from pyspark import since, keyword_only
from pyspark.rdd import ignore_unicode_prefix
from pyspark.ml.linalg import _convert_to_vector
@@ -159,9 +157,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
"Split points for mapping continuous features into buckets. With n+1 splits, " +
"there are n buckets. A bucket defined by splits x,y holds values in the " +
"range [x,y) except the last bucket, which also includes y. The splits " +
- "should be strictly increasing. Values at -inf, inf must be explicitly " +
- "provided to cover all Double values; otherwise, values outside the splits " +
- "specified will be treated as errors.",
+ "should be of length >= 3 and strictly increasing. Values at -inf, inf must be " +
+ "explicitly provided to cover all Double values; otherwise, values outside the " +
+ "splits specified will be treated as errors.",
typeConverter=TypeConverters.toListFloat)
@keyword_only
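A brief sketch of how these splits behave (hypothetical data, assumes an active `spark` session): four splits define three buckets, and the -inf/inf endpoints cover every Double value, so no input is treated as an error.

```python
from pyspark.ml.feature import Bucketizer

splits = [float("-inf"), 0.0, 1.0, float("inf")]  # 4 splits -> 3 buckets
bucketizer = Bucketizer(splits=splits, inputCol="values", outputCol="buckets")
df = spark.createDataFrame([(-0.5,), (0.3,), (1.5,)], ["values"])
# Each bucket is [x, y): -0.5 -> bucket 0.0, 0.3 -> bucket 1.0, 1.5 -> bucket 2.0
bucketizer.transform(df).show()
```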
@@ -1171,8 +1169,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
@inherit_doc
-class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable,
- JavaMLWritable):
+class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
@@ -1186,9 +1183,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
>>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
>>> qds = QuantileDiscretizer(numBuckets=2,
- ... inputCol="values", outputCol="buckets", seed=123, relativeError=0.01)
- >>> qds.getSeed()
- 123
+ ... inputCol="values", outputCol="buckets", relativeError=0.01)
>>> qds.getRelativeError()
0.01
>>> bucketizer = qds.fit(df)
@@ -1220,9 +1215,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
typeConverter=TypeConverters.toFloat)
@keyword_only
- def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001):
+ def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
"""
- __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001)
+ __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
"""
super(QuantileDiscretizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
@@ -1233,11 +1228,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
@keyword_only
@since("2.0.0")
- def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None,
- relativeError=0.001):
+ def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
"""
- setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \
- relativeError=0.001)
+ setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
Set the params for the QuantileDiscretizer
"""
kwargs = self.setParams._input_kwargs
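As a usage note (a sketch, not part of the patch): with `seed` gone, the keyword-only setter takes the same trimmed parameter list as the constructor.

```python
# Parameters can also be (re)set after construction.
qds = QuantileDiscretizer()
qds.setParams(numBuckets=3, inputCol="values", outputCol="buckets",
              relativeError=0.001)
```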
@@ -1481,6 +1474,10 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
Standardizes features by removing the mean and scaling to unit variance using column summary
statistics on the samples in the training set.
+ The "unit std" is computed using the `corrected sample standard deviation \
+ <https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation>`_,
+ which is computed as the square root of the unbiased sample variance.
+
>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
>>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
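To make the new "unit std" note concrete, a small worked check using the two samples from the doctest above (plain Python, no Spark required): with values 0.0 and 2.0, the corrected sample standard deviation is sqrt(2).

```python
import math

xs = [0.0, 2.0]                     # the two samples from the doctest
mean = sum(xs) / len(xs)            # 1.0
# Unbiased sample variance divides by n - 1, not n.
var = sum((x - mean) ** 2 for x in xs) / (len(xs) - 1)   # 2.0
std = math.sqrt(var)                # ~1.4142
# With withStd=True (the default), StandardScaler divides each feature by
# this std, so the scaled column holds 0.0 and 2.0 / 1.4142 ~= 1.4142.
```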