about summary refs log tree commit diff
path: root/mllib
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2016-06-01 10:49:51 -0700
committerNick Pentreath <nickp@za.ibm.com>2016-06-01 10:49:51 -0700
commit07a98ca4ce4e715ce32b4be75010e28764da459b (patch)
tree29f45b7515182db24b62b505d0efb8dbb76f708c /mllib
parenta71d1364ae87aa388128da34dd0b9b02ff85e458 (diff)
downloadspark-07a98ca4ce4e715ce32b4be75010e28764da459b.tar.gz
spark-07a98ca4ce4e715ce32b4be75010e28764da459b.tar.bz2
spark-07a98ca4ce4e715ce32b4be75010e28764da459b.zip
[SPARK-15587][ML] ML 2.0 QA: Scala APIs audit for ml.feature
## What changes were proposed in this pull request? ML 2.0 QA: Scala APIs audit for ml.feature. Mainly include: * Remove seed for ```QuantileDiscretizer```, since we use ```approxQuantile``` to produce bins and ```seed``` is useless. * Scala API docs update. * Sync Scala and Python API docs for these changes. ## How was this patch tested? Existing tests. Author: Yanbo Liang <ybliang8@gmail.com> Closes #13410 from yanboliang/spark-15587.
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala6
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala10
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala7
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala3
4 files changed, 10 insertions, 16 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index 10e622ace6..ff988cc815 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -43,7 +43,7 @@ final class Bucketizer(override val uid: String)
/**
* Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets.
* A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which
- * also includes y. Splits should be strictly increasing.
+ * also includes y. Splits should be of length >= 3 and strictly increasing.
* Values at -inf, inf must be explicitly provided to cover all Double values;
* otherwise, values outside the splits specified will be treated as errors.
* @group param
@@ -51,8 +51,8 @@ final class Bucketizer(override val uid: String)
val splits: DoubleArrayParam = new DoubleArrayParam(this, "splits",
"Split points for mapping continuous features into buckets. With n+1 splits, there are n " +
"buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last " +
- "bucket, which also includes y. The splits should be strictly increasing. " +
- "Values at -inf, inf must be explicitly provided to cover all Double values; " +
+ "bucket, which also includes y. The splits should be of length >= 3 and strictly " +
+ "increasing. Values at -inf, inf must be explicitly provided to cover all Double values; " +
"otherwise, values outside the splits specified will be treated as errors.",
Bucketizer.checkSplits)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index fc4885bf4b..272567d09c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -56,7 +56,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
* If this is an integer >= 1, this specifies the number of documents the term must appear in;
* if this is a double in [0,1), then this specifies the fraction of documents.
*
- * Default: 1
+ * Default: 1.0
* @group param
*/
val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" +
@@ -86,7 +86,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
* Note that the parameter is only used in transform of [[CountVectorizerModel]] and does not
* affect fitting.
*
- * Default: 1
+ * Default: 1.0
* @group param
*/
val minTF: DoubleParam = new DoubleParam(this, "minTF", "Filter to ignore rare words in" +
@@ -96,8 +96,6 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
" of the document's token count). Note that the parameter is only used in transform of" +
" CountVectorizerModel and does not affect fitting.", ParamValidators.gtEq(0.0))
- setDefault(minTF -> 1)
-
/** @group getParam */
def getMinTF: Double = $(minTF)
@@ -114,7 +112,7 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
/** @group getParam */
def getBinary: Boolean = $(binary)
- setDefault(binary -> false)
+ setDefault(vocabSize -> (1 << 18), minDF -> 1.0, minTF -> 1.0, binary -> false)
}
/**
@@ -145,8 +143,6 @@ class CountVectorizer(override val uid: String)
/** @group setParam */
def setBinary(value: Boolean): this.type = set(binary, value)
- setDefault(vocabSize -> (1 << 18), minDF -> 1)
-
@Since("2.0.0")
override def fit(dataset: Dataset[_]): CountVectorizerModel = {
transformSchema(dataset.schema, logging = true)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index 61483590cd..1fefaa1fdd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging
import org.apache.spark.ml._
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{DoubleType, StructType}
@@ -31,7 +31,7 @@ import org.apache.spark.sql.types.{DoubleType, StructType}
* Params for [[QuantileDiscretizer]].
*/
private[feature] trait QuantileDiscretizerBase extends Params
- with HasInputCol with HasOutputCol with HasSeed {
+ with HasInputCol with HasOutputCol {
/**
* Number of buckets (quantiles, or categories) into which data points are grouped. Must
@@ -91,9 +91,6 @@ final class QuantileDiscretizer(override val uid: String)
/** @group setParam */
def setOutputCol(value: String): this.type = set(outputCol, value)
- /** @group setParam */
- def setSeed(value: Long): this.type = set(seed, value)
-
override def transformSchema(schema: StructType): StructType = {
SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
val inputFields = schema.fields
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 1b929cdfff..2d89eb05a5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -51,7 +51,8 @@ private[feature] trait Word2VecBase extends Params
def getVectorSize: Int = $(vectorSize)
/**
- * The window size (context words from [-window, window]) default 5.
+ * The window size (context words from [-window, window]).
+ * Default: 5
* @group expertParam
*/
final val windowSize = new IntParam(