aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Yuhao Yang <hhbyyh@gmail.com> 2016-03-18 17:34:33 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2016-03-18 17:34:33 -0700
commit: f43a26ef9260396761e28aafd5c7b9600c2b04d9 (patch)
tree: e66b498652aa68b2e5deb332fe590357568dea58
parent: 54794113a6a906b0f9c6bfb9da322e18e007214c (diff)
download: spark-f43a26ef9260396761e28aafd5c7b9600c2b04d9.tar.gz
spark-f43a26ef9260396761e28aafd5c7b9600c2b04d9.tar.bz2
spark-f43a26ef9260396761e28aafd5c7b9600c2b04d9.zip
[SPARK-13629][ML] Add binary toggle Param to CountVectorizer
## What changes were proposed in this pull request?

This is continued work for https://github.com/apache/spark/pull/11536#issuecomment-198511013, containing some comment updates and style adjustments.

jkbradley

## How was this patch tested?

unit tests.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #11830 from hhbyyh/cvToggle.
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala | 23
1 files changed, 9 insertions, 14 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index a3845d3977..5694b3890f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -207,13 +207,12 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
def setMinTF(value: Double): this.type = set(minTF, value)
/**
- * Binary toggle to control the output vector values.
- * If True, all non zero counts are set to 1. This is useful for discrete probabilistic
- * models that model binary events rather than integer counts
- *
- * Default: false
- * @group param
- */
+ * Binary toggle to control the output vector values.
+ * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
+ * discrete probabilistic models that model binary events rather than integer counts.
+ * Default: false
+ * @group param
+ */
val binary: BooleanParam =
new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
"This is useful for discrete probabilistic models that model binary events rather " +
@@ -248,17 +247,13 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
}
tokenCount += 1
}
- val effectiveMinTF = if (minTf >= 1.0) {
- minTf
- } else {
- tokenCount * minTf
- }
+ val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf
val effectiveCounts = if ($(binary)) {
termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq
- }
- else {
+ } else {
termCounts.filter(_._2 >= effectiveMinTF).toSeq
}
+
Vectors.sparse(dictBr.value.size, effectiveCounts)
}
dataset.withColumn($(outputCol), vectorizer(col($(inputCol))))