author    Yanbo Liang <ybliang8@gmail.com>    2016-06-01 10:49:51 -0700
committer Nick Pentreath <nickp@za.ibm.com>  2016-06-01 10:49:51 -0700
commit    07a98ca4ce4e715ce32b4be75010e28764da459b (patch)
tree      29f45b7515182db24b62b505d0efb8dbb76f708c
parent    a71d1364ae87aa388128da34dd0b9b02ff85e458 (diff)
[SPARK-15587][ML] ML 2.0 QA: Scala APIs audit for ml.feature
## What changes were proposed in this pull request?

ML 2.0 QA: Scala APIs audit for ml.feature. Main changes include:
* Remove seed for ```QuantileDiscretizer```, since we use ```approxQuantile``` to produce bins and ```seed``` has no effect (see the usage sketch below).
* Scala API docs update.
* Sync Scala and Python API docs for these changes.

## How was this patch tested?

Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13410 from yanboliang/spark-15587.
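A minimal usage sketch of ```QuantileDiscretizer``` after the seed removal. This is illustrative only and assumes a Spark 2.0 ```SparkSession``` named ```spark``` is already in scope:

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

// Assumes a SparkSession named `spark`; implicits provide toDF for local Seqs.
import spark.implicits._

val df = Seq(0.1, 0.4, 1.2, 1.5).toDF("values")

// No setSeed anymore: bins come from approxQuantile, so a seed would have no effect.
val discretizer = new QuantileDiscretizer()
  .setInputCol("values")
  .setOutputCol("buckets")
  .setNumBuckets(2)
  .setRelativeError(0.01)

discretizer.fit(df).transform(df).show()
```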
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala            6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala      10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala   7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala              3
-rwxr-xr-x  python/pyspark/ml/feature.py                                                29
5 files changed, 23 insertions, 32 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 10e622ace6..ff988cc815 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -43,7 +43,7 @@ final class Bucketizer(override val uid: String)
/**
* Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets.
* A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which
- * also includes y. Splits should be strictly increasing.
+ * also includes y. Splits should be of length >= 3 and strictly increasing.
* Values at -inf, inf must be explicitly provided to cover all Double values;
* otherwise, values outside the splits specified will be treated as errors.
* @group param
@@ -51,8 +51,8 @@ final class Bucketizer(override val uid: String)
val splits: DoubleArrayParam = new DoubleArrayParam(this, "splits",
"Split points for mapping continuous features into buckets. With n+1 splits, there are n " +
"buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last " +
- "bucket, which also includes y. The splits should be strictly increasing. " +
- "Values at -inf, inf must be explicitly provided to cover all Double values; " +
+ "bucket, which also includes y. The splits should be of length >= 3 and strictly " +
+ "increasing. Values at -inf, inf must be explicitly provided to cover all Double values; " +
"otherwise, values outside the splits specified will be treated as errors.",
Bucketizer.checkSplits)
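To illustrate the splits contract documented above (length >= 3, strictly increasing, with -inf/inf to cover all Double values), here is a hedged sketch, again assuming a ```SparkSession``` named ```spark```:

```scala
import org.apache.spark.ml.feature.Bucketizer

// Assumes a SparkSession named `spark`.
import spark.implicits._

// 5 splits define 4 buckets; splits must be strictly increasing and have length >= 3.
// -inf and inf are included so every Double value falls into some bucket.
val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

val data = Seq(-999.9, -0.5, -0.3, 0.0, 0.2, 999.9).toDF("features")

val bucketizer = new Bucketizer()
  .setInputCol("features")
  .setOutputCol("bucketedFeatures")
  .setSplits(splits)

// Each value maps to the bucket [x, y) that contains it; the last bucket also includes y.
bucketizer.transform(data).show()
```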
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index fc4885bf4b..272567d09c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -56,7 +56,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
* If this is an integer >= 1, this specifies the number of documents the term must appear in;
* if this is a double in [0,1), then this specifies the fraction of documents.
*
- * Default: 1
+ * Default: 1.0
* @group param
*/
val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" +
@@ -86,7 +86,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
* Note that the parameter is only used in transform of [[CountVectorizerModel]] and does not
* affect fitting.
*
- * Default: 1
+ * Default: 1.0
* @group param
*/
val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" +
@@ -96,8 +96,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
" of the document's token count). Note that the parameter is only used in transform of" +
" CountVectorizerModel and does not affect fitting.", ParamValidators.gtEq(0.0))
- setDefault(minTF -> 1)
-
/** @group getParam */
def getMinTF: Double = $(minTF)
@@ -114,7 +112,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
/** @group getParam */
def getBinary: Boolean = $(binary)
- setDefault(binary -> false)
+ setDefault(vocabSize -> (1 << 18), minDF -> 1.0, minTF -> 1.0, binary -> false)
}
/**
@@ -145,8 +143,6 @@ class CountVectorizer(override val uid: String)
/** @group setParam */
def setBinary(value: Boolean): this.type = set(binary, value)
- setDefault(vocabSize -> (1 << 18), minDF -> 1)
-
@Since("2.0.0")
override def fit(dataset: Dataset[_]): CountVectorizerModel = {
transformSchema(dataset.schema, logging = true)
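A sketch of how the minDF semantics described above behave (a value >= 1 is a document count, a value in [0, 1) is a fraction of documents), assuming a ```SparkSession``` named ```spark```:

```scala
import org.apache.spark.ml.feature.CountVectorizer

// Assumes a SparkSession named `spark`.
import spark.implicits._

val docs = Seq(
  (0, Array("a", "b", "c")),
  (1, Array("a", "b", "b", "c", "a"))
).toDF("id", "words")

// minDF >= 1 is a minimum document count; a value in [0, 1) would be a fraction.
// With the consolidated defaults, minDF and minTF are both 1.0 unless overridden.
val cvModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(3)
  .setMinDF(2)
  .fit(docs)

cvModel.transform(docs).show(truncate = false)
```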
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index 61483590cd..1fefaa1fdd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging
import org.apache.spark.ml._
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{DoubleType, StructType}
@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.{DoubleType, StructType}
* Params for [[QuantileDiscretizer]].
*/
private[feature] trait QuantileDiscretizerBase extends Params
- with HasInputCol with HasOutputCol with HasSeed {
+ with HasInputCol with HasOutputCol {
/**
* Number of buckets (quantiles, or categories) into which data points are grouped. Must
@@ -91,9 +91,6 @@ final class QuantileDiscretizer(override val uid: String)
/** @group setParam */
def setOutputCol(value: String): this.type = set(outputCol, value)
- /** @group setParam */
- def setSeed(value: Long): this.type = set(seed, value)
-
override def transformSchema(schema: StructType): StructType = {
SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
val inputFields = schema.fields
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 1b929cdfff..2d89eb05a5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -51,7 +51,8 @@ private[feature] trait Word2VecBase extends Params
def getVectorSize: Int = $(vectorSize)
/**
- * The window size (context words from [-window, window]) default 5.
+ * The window size (context words from [-window, window]).
+ * Default: 5
* @group expertParam
*/
final val windowSize = new IntParam(
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index eb555cb940..1aff2e550f 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -19,8 +19,6 @@ import sys
if sys.version > '3':
basestring = str
-from py4j.java_collections import JavaArray
-
from pyspark import since, keyword_only
from pyspark.rdd import ignore_unicode_prefix
from pyspark.ml.linalg import _convert_to_vector
@@ -159,9 +157,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
"Split points for mapping continuous features into buckets. With n+1 splits, " +
"there are n buckets. A bucket defined by splits x,y holds values in the " +
"range [x,y) except the last bucket, which also includes y. The splits " +
- "should be strictly increasing. Values at -inf, inf must be explicitly " +
- "provided to cover all Double values; otherwise, values outside the splits " +
- "specified will be treated as errors.",
+ "should be of length >= 3 and strictly increasing. Values at -inf, inf must be " +
+ "explicitly provided to cover all Double values; otherwise, values outside the " +
+ "splits specified will be treated as errors.",
typeConverter=TypeConverters.toListFloat)
@keyword_only
@@ -1171,8 +1169,7 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
@inherit_doc
-class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, JavaMLReadable,
- JavaMLWritable):
+class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental
@@ -1186,9 +1183,7 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
>>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
>>> qds = QuantileDiscretizer(numBuckets=2,
- ... inputCol="values", outputCol="buckets", seed=123, relativeError=0.01)
- >>> qds.getSeed()
- 123
+ ... inputCol="values", outputCol="buckets", relativeError=0.01)
>>> qds.getRelativeError()
0.01
>>> bucketizer = qds.fit(df)
@@ -1220,9 +1215,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
typeConverter=TypeConverters.toFloat)
@keyword_only
- def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001):
+ def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
"""
- __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001)
+ __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
"""
super(QuantileDiscretizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
@@ -1233,11 +1228,9 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
@keyword_only
@since("2.0.0")
- def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None,
- relativeError=0.001):
+ def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
"""
- setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \
- relativeError=0.001)
+ setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
Set the params for the QuantileDiscretizer
"""
kwargs = self.setParams._input_kwargs
@@ -1481,6 +1474,10 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
Standardizes features by removing the mean and scaling to unit variance using column summary
statistics on the samples in the training set.
+ The "unit std" is computed using the `corrected sample standard deviation \
+ <https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation>`_,
+ which is computed as the square root of the unbiased sample variance.
+
>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
>>> standardScaler = StandardScaler(inputCol="a", outputCol="scaled")
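For reference, the "corrected sample standard deviation" mentioned in the new StandardScaler docstring uses the unbiased (n - 1) variance. A small Scala sketch of the arithmetic for a two-value column like the doctest above:

```scala
// Corrected sample standard deviation: unbiased variance (n - 1 denominator), then sqrt.
val xs = Seq(0.0, 2.0)
val mean = xs.sum / xs.size                                // 1.0
val unbiasedVariance =
  xs.map(x => math.pow(x - mean, 2)).sum / (xs.size - 1)   // 2.0
val correctedStd = math.sqrt(unbiasedVariance)             // ~1.4142
// Scaling 0.0 and 2.0 by this std yields 0.0 and ~1.4142.
```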