diff options
author | Yuhao Yang <hhbyyh@gmail.com> | 2016-03-18 17:34:33 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2016-03-18 17:34:33 -0700 |
commit | f43a26ef9260396761e28aafd5c7b9600c2b04d9 (patch) | |
tree | e66b498652aa68b2e5deb332fe590357568dea58 /mllib/src | |
parent | 54794113a6a906b0f9c6bfb9da322e18e007214c (diff) | |
download | spark-f43a26ef9260396761e28aafd5c7b9600c2b04d9.tar.gz spark-f43a26ef9260396761e28aafd5c7b9600c2b04d9.tar.bz2 spark-f43a26ef9260396761e28aafd5c7b9600c2b04d9.zip |
[SPARK-13629][ML] Add binary toggle Param to CountVectorizer
## What changes were proposed in this pull request?
This is a continuation of the work in https://github.com/apache/spark/pull/11536#issuecomment-198511013,
containing some comment updates and style adjustments.
jkbradley
## How was this patch tested?
unit tests.
Author: Yuhao Yang <hhbyyh@gmail.com>
Closes #11830 from hhbyyh/cvToggle.
Diffstat (limited to 'mllib/src')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala | 23 |
1 file changed, 9 insertions, 14 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala index a3845d3977..5694b3890f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -207,13 +207,12 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin def setMinTF(value: Double): this.type = set(minTF, value) /** - * Binary toggle to control the output vector values. - * If True, all non zero counts are set to 1. This is useful for discrete probabilistic - * models that model binary events rather than integer counts - * - * Default: false - * @group param - */ + * Binary toggle to control the output vector values. + * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for + * discrete probabilistic models that model binary events rather than integer counts. + * Default: false + * @group param + */ val binary: BooleanParam = new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " + "This is useful for discrete probabilistic models that model binary events rather " + @@ -248,17 +247,13 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin } tokenCount += 1 } - val effectiveMinTF = if (minTf >= 1.0) { - minTf - } else { - tokenCount * minTf - } + val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf val effectiveCounts = if ($(binary)) { termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq - } - else { + } else { termCounts.filter(_._2 >= effectiveMinTF).toSeq } + Vectors.sparse(dictBr.value.size, effectiveCounts) } dataset.withColumn($(outputCol), vectorizer(col($(inputCol)))) |