From f43a26ef9260396761e28aafd5c7b9600c2b04d9 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <hhbyyh@gmail.com>
Date: Fri, 18 Mar 2016 17:34:33 -0700
Subject: [SPARK-13629][ML] Add binary toggle Param to CountVectorizer

## What changes were proposed in this pull request?

This is a continuation of the work in https://github.com/apache/spark/pull/11536#issuecomment-198511013,
containing some comment updates and style adjustments.
jkbradley

## How was this patch tested?

Unit tests.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #11830 from hhbyyh/cvToggle.
---
 .../apache/spark/ml/feature/CountVectorizer.scala  | 23 +++++++++-------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'mllib/src')

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index a3845d3977..5694b3890f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -207,13 +207,12 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
   def setMinTF(value: Double): this.type = set(minTF, value)
 
   /**
-    * Binary toggle to control the output vector values.
-    * If True, all non zero counts are set to 1. This is useful for discrete probabilistic
-    * models that model binary events rather than integer counts
-    *
-    * Default: false
-    * @group param
-    */
+   * Binary toggle to control the output vector values.
+   * If True, all nonzero counts (after the minTF filter is applied) are set to 1. This is useful
+   * for discrete probabilistic models that model binary events rather than integer counts.
+   * Default: false
+   * @group param
+   */
   val binary: BooleanParam =
     new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
       "This is useful for discrete probabilistic models that model binary events rather " +
@@ -248,17 +247,13 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
         }
         tokenCount += 1
       }
-      val effectiveMinTF = if (minTf >= 1.0) {
-        minTf
-      } else {
-        tokenCount * minTf
-      }
+      val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf
       val effectiveCounts = if ($(binary)) {
         termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq
-      }
-      else {
+      } else {
         termCounts.filter(_._2 >= effectiveMinTF).toSeq
       }
+
       Vectors.sparse(dictBr.value.size, effectiveCounts)
     }
     dataset.withColumn($(outputCol), vectorizer(col($(inputCol))))
-- 
cgit v1.2.3