aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorwm624@hotmail.com <wm624@hotmail.com>2016-04-09 09:57:07 +0200
committerNick Pentreath <nick.pentreath@gmail.com>2016-04-09 09:57:07 +0200
commita9b8b655b25f4ed519037faaf7601a3d9842547f (patch)
tree3a50f1327b9869b61859db401f72c30fbd14e0d9 /mllib
parent90c0a04506a4972b7a2ac2b7dda0c5f8509a6e2f (diff)
downloadspark-a9b8b655b25f4ed519037faaf7601a3d9842547f.tar.gz
spark-a9b8b655b25f4ed519037faaf7601a3d9842547f.tar.bz2
spark-a9b8b655b25f4ed519037faaf7601a3d9842547f.zip
[SPARK-14392][ML] CountVectorizer Estimator should include binary toggle Param
## What changes were proposed in this pull request? CountVectorizerModel has a binary toggle param. This PR adds the binary toggle param to the CountVectorizer estimator as well. As discussed in the JIRA, instead of adding a separate param to CountVectorizer, I moved the binary param into CountVectorizerParams, so the estimator inherits it. ## How was this patch tested? Added a new test case, which fits the model with the binary flag set to true and then checks that all non-zero counts produced by the trained model are set to 1.0. All tests in CountVectorizerSuite.scala pass. Author: wm624@hotmail.com <wm624@hotmail.com> Closes #12200 from wangmiao1981/binary_param.
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala35
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala19
2 files changed, 34 insertions, 20 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 5694b3890f..f1be971a6a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -100,6 +100,21 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
/** @group getParam */
def getMinTF: Double = $(minTF)
+
+ /**
+ * Binary toggle to control the output vector values.
+ * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
+ * discrete probabilistic models that model binary events rather than integer counts.
+ * Default: false
+ * @group param
+ */
+ val binary: BooleanParam =
+ new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.")
+
+ /** @group getParam */
+ def getBinary: Boolean = $(binary)
+
+ setDefault(binary -> false)
}
/**
@@ -127,6 +142,9 @@ class CountVectorizer(override val uid: String)
/** @group setParam */
def setMinTF(value: Double): this.type = set(minTF, value)
+ /** @group setParam */
+ def setBinary(value: Boolean): this.type = set(binary, value)
+
setDefault(vocabSize -> (1 << 18), minDF -> 1)
override def fit(dataset: DataFrame): CountVectorizerModel = {
@@ -206,26 +224,9 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
/** @group setParam */
def setMinTF(value: Double): this.type = set(minTF, value)
- /**
- * Binary toggle to control the output vector values.
- * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
- * discrete probabilistic models that model binary events rather than integer counts.
- * Default: false
- * @group param
- */
- val binary: BooleanParam =
- new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
- "This is useful for discrete probabilistic models that model binary events rather " +
- "than integer counts")
-
- /** @group getParam */
- def getBinary: Boolean = $(binary)
-
/** @group setParam */
def setBinary(value: Boolean): this.type = set(binary, value)
- setDefault(binary -> false)
-
/** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */
private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 04f165c5f1..ff0de06e27 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -168,21 +168,34 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
}
}
- test("CountVectorizerModel with binary") {
+ test("CountVectorizerModel and CountVectorizer with binary") {
val df = sqlContext.createDataFrame(Seq(
- (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
+ (0, split("a a a a b b b b c d"),
+ Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))),
(1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))),
(2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))
)).toDF("id", "words", "expected")
- val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
+ // CountVectorizer test
+ val cv = new CountVectorizer()
.setInputCol("words")
.setOutputCol("features")
.setBinary(true)
+ .fit(df)
cv.transform(df).select("features", "expected").collect().foreach {
case Row(features: Vector, expected: Vector) =>
assert(features ~== expected absTol 1e-14)
}
+
+ // CountVectorizerModel test
+ val cv2 = new CountVectorizerModel(cv.vocabulary)
+ .setInputCol("words")
+ .setOutputCol("features")
+ .setBinary(true)
+ cv2.transform(df).select("features", "expected").collect().foreach {
+ case Row(features: Vector, expected: Vector) =>
+ assert(features ~== expected absTol 1e-14)
+ }
}
test("CountVectorizer read/write") {