aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/mllib/feature.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/mllib/feature.py')
-rw-r--r-- python/pyspark/mllib/feature.py 13
1 files changed, 12 insertions, 1 deletions
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 6129353525..b3dd2f63a5 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -379,6 +379,17 @@ class HashingTF(object):
"""
def __init__(self, numFeatures=1 << 20):
self.numFeatures = numFeatures
+ self.binary = False
+
+ @since("2.0.0")
+ def setBinary(self, value):
+ """
+ If True, term frequency vector will be binary such that non-zero
+ term counts will be set to 1
+ (default: False)
+ """
+ self.binary = value
+ return self
@since('1.2.0')
def indexOf(self, term):
@@ -398,7 +409,7 @@ class HashingTF(object):
freq = {}
for term in document:
i = self.indexOf(term)
- freq[i] = freq.get(i, 0) + 1.0
+ freq[i] = 1.0 if self.binary else freq.get(i, 0) + 1.0
return Vectors.sparse(self.numFeatures, freq.items())