author     Yanbo Liang <ybliang8@gmail.com>   2016-06-01 10:49:51 -0700
committer  Nick Pentreath <nickp@za.ibm.com>  2016-06-01 10:49:51 -0700
commit     07a98ca4ce4e715ce32b4be75010e28764da459b (patch)
tree       29f45b7515182db24b62b505d0efb8dbb76f708c /python
parent     a71d1364ae87aa388128da34dd0b9b02ff85e458 (diff)
[SPARK-15587][ML] ML 2.0 QA: Scala APIs audit for ml.feature
## What changes were proposed in this pull request?

ML 2.0 QA: Scala APIs audit for ml.feature. The main changes are:

* Remove `seed` from `QuantileDiscretizer`, since we use `approxQuantile` to produce the bins and the `seed` has no effect.
* Update the Scala API docs.
* Sync the Scala and Python API docs with these changes.

## How was this patch tested?

Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13410 from yanboliang/spark-15587.
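For reference, a minimal sketch of the updated Python API after this change (it assumes an active `SparkSession` named `spark` and mirrors the doctest updated in the diff below):

```python
from pyspark.ml.feature import QuantileDiscretizer

df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
# No seed parameter anymore: the bins come from approxQuantile,
# so a seed had no effect on the result.
qds = QuantileDiscretizer(numBuckets=2, inputCol="values",
                          outputCol="buckets", relativeError=0.01)
bucketizer = qds.fit(df)          # fitting produces a Bucketizer
bucketed = bucketizer.transform(df)
```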
Diffstat (limited to 'python')
-rwxr-xr-x  python/pyspark/ml/feature.py  29
1 file changed, 13 insertions(+), 16 deletions(-)
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index eb555cb940..1aff2e550f 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -19,8 +19,6 @@ import sys
if sys.version > '3':
basestring = str
-from py4j.java_collections import JavaArray
-
from pyspark import since, keyword_only
from pyspark.rdd import ignore_unicode_prefix
from pyspark.ml.linalg import _convert_to_vector
@@ -159,9 +157,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
"Split points for mapping continuous features into buckets. With n+1 splits, " +
"there are n buckets. A bucket defined by splits x,y holds values in the " +
"range [x,y) except the last bucket, which also includes y. The splits " +
- "should be strictly increasing. Values at -inf, inf must be explicitly " +
- "provided to cover all Double values; otherwise, values outside the splits " +
- "specified will be treated as errors.",
+ "should be of length >= 3 and strictly increasing. Values at -inf, inf must be " +
+ "explicitly provided to cover all Double values; otherwise, values outside the " +
+ "splits specified will be treated as errors.",
typeConverter=TypeConverters.toListFloat)
@keyword_only
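A brief sketch of how these splits behave (hypothetical data, assumes an active `spark` session): four splits define three buckets, and the -inf/inf endpoints cover every Double value, so no input is treated as an error.

```python
from pyspark.ml.feature import Bucketizer

splits = [float("-inf"), 0.0, 1.0, float("inf")]  # 4 splits -> 3 buckets
bucketizer = Bucketizer(splits=splits, inputCol="values", outputCol="buckets")
df = spark.createDataFrame([(-0.5,), (0.3,), (1.5,)], ["values"])
# Each bucket is [x, y): -0.5 -> bucket 0.0, 0.3 -> bucket 1.0, 1.5 -> bucket 2.0
bucketizer.transform(df).show()
```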
@@ -1171,8 +1169,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
@inherit_doc
-class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable,
- JavaMLWritable):
+class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
@@ -1186,9 +1183,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
>>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
>>> qds = QuantileDiscretizer(numBuckets=2,
- ... inputCol="values", outputCol="buckets", seed=123, relativeError=0.01)
- >>> qds.getSeed()
- 123
+ ... inputCol="values", outputCol="buckets", relativeError=0.01)
>>> qds.getRelativeError()
0.01
>>> bucketizer = qds.fit(df)
@@ -1220,9 +1215,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
typeConverter=TypeConverters.toFloat)
@keyword_only
- def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001):
+ def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
"""
- __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001)
+ __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
"""
super(QuantileDiscretizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
@@ -1233,11 +1228,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
@keyword_only
@since("2.0.0")
- def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None,
- relativeError=0.001):
+ def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
"""
- setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \
- relativeError=0.001)
+ setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
Set the params for the QuantileDiscretizer
"""
kwargs = self.setParams._input_kwargs
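As a usage note (a sketch, not part of the patch): with `seed` gone, the keyword-only setter takes the same trimmed parameter list as the constructor.

```python
# Parameters can also be (re)set after construction.
qds = QuantileDiscretizer()
qds.setParams(numBuckets=3, inputCol="values", outputCol="buckets",
              relativeError=0.001)
```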
@@ -1481,6 +1474,10 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
Standardizes features by removing the mean and scaling to unit variance using column summary
statistics on the samples in the training set.
+ The "unit std" is computed using the `corrected sample standard deviation \
+ <https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation>`_,
+ which is computed as the square root of the unbiased sample variance.
+
>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
>>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
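To make the new "unit std" note concrete, a small worked check using the two samples from the doctest above (plain Python, no Spark required): with values 0.0 and 2.0, the corrected sample standard deviation is sqrt(2).

```python
import math

xs = [0.0, 2.0]                     # the two samples from the doctest
mean = sum(xs) / len(xs)            # 1.0
# Unbiased sample variance divides by n - 1, not n.
var = sum((x - mean) ** 2 for x in xs) / (len(xs) - 1)   # 2.0
std = math.sqrt(var)                # ~1.4142
# With withStd=True (the default), StandardScaler divides each feature by
# this std, so the scaled column holds 0.0 and 2.0 / 1.4142 ~= 1.4142.
```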