From a9b8b655b25f4ed519037faaf7601a3d9842547f Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Sat, 9 Apr 2016 09:57:07 +0200 Subject: [SPARK-14392][ML] CountVectorizer Estimator should include binary toggle Param ## What changes were proposed in this pull request? CountVectorizerModel has a binary toggle param. This PR adds the binary toggle param to the CountVectorizer estimator. As discussed in the JIRA, instead of adding a param to CountVectorizer, I moved the binary param to CountVectorizerParams. Therefore, the estimator inherits the binary param. ## How was this patch tested? Added a new test case, which fits the model with the binary flag set to true and then checks that all non-zero counts produced by the trained model are set to 1.0. All tests in CountVectorizerSuite.scala pass. Author: wm624@hotmail.com Closes #12200 from wangmiao1981/binary_param. --- .../apache/spark/ml/feature/CountVectorizer.scala | 35 +++++++++++----------- .../spark/ml/feature/CountVectorizerSuite.scala | 19 ++++++++++-- 2 files changed, 34 insertions(+), 20 deletions(-) (limited to 'mllib/src') diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index 5694b3890f..f1be971a6a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -100,6 +100,21 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit /** @group getParam */ def getMinTF: Double = $(minTF) + + /** + * Binary toggle to control the output vector values. + * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for + * discrete probabilistic models that model binary events rather than integer counts. 
+ * Default: false + * @group param + */ + val binary: BooleanParam = + new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.") + + /** @group getParam */ + def getBinary: Boolean = $(binary) + + setDefault(binary -> false) } /** @@ -127,6 +142,9 @@ class CountVectorizer(override val uid: String) /** @group setParam */ def setMinTF(value: Double): this.type = set(minTF, value) + /** @group setParam */ + def setBinary(value: Boolean): this.type = set(binary, value) + setDefault(vocabSize -> (1 << 18), minDF -> 1) override def fit(dataset: DataFrame): CountVectorizerModel = { @@ -206,26 +224,9 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin /** @group setParam */ def setMinTF(value: Double): this.type = set(minTF, value) - /** - * Binary toggle to control the output vector values. - * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for - * discrete probabilistic models that model binary events rather than integer counts. - * Default: false - * @group param - */ - val binary: BooleanParam = - new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. 
" + - "This is useful for discrete probabilistic models that model binary events rather " + - "than integer counts") - - /** @group getParam */ - def getBinary: Boolean = $(binary) - /** @group setParam */ def setBinary(value: Boolean): this.type = set(binary, value) - setDefault(binary -> false) - /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */ private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala index 04f165c5f1..ff0de06e27 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala @@ -168,21 +168,34 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext } } - test("CountVectorizerModel with binary") { + test("CountVectorizerModel and CountVectorizer with binary") { val df = sqlContext.createDataFrame(Seq( - (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))), + (0, split("a a a a b b b b c d"), + Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))), (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))), (2, split("a"), Vectors.sparse(4, Seq((0, 1.0)))) )).toDF("id", "words", "expected") - val cv = new CountVectorizerModel(Array("a", "b", "c", "d")) + // CountVectorizer test + val cv = new CountVectorizer() .setInputCol("words") .setOutputCol("features") .setBinary(true) + .fit(df) cv.transform(df).select("features", "expected").collect().foreach { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } + + // CountVectorizerModel test + val cv2 = new CountVectorizerModel(cv.vocabulary) + .setInputCol("words") + .setOutputCol("features") + .setBinary(true) + cv2.transform(df).select("features", "expected").collect().foreach { 
+ case Row(features: Vector, expected: Vector) => + assert(features ~== expected absTol 1e-14) + } } test("CountVectorizer read/write") { -- cgit v1.2.3