aboutsummaryrefslogtreecommitdiff
path: root/docs/mllib-feature-extraction.md
diff options
context:
space:
mode:
Diffstat (limited to 'docs/mllib-feature-extraction.md')
-rw-r--r--docs/mllib-feature-extraction.md15
1 files changed, 15 insertions, 0 deletions
diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 41a27f6208..1511ae6dda 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -82,6 +82,21 @@ tf.cache()
val idf = new IDF().fit(tf)
val tfidf: RDD[Vector] = idf.transform(tf)
{% endhighlight %}
+
+MLLib's IDF implementation provides an option for ignoring terms which occur in less than a
+minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature
+can be used by passing the `minDocFreq` value to the IDF constructor.
+
+{% highlight scala %}
+import org.apache.spark.mllib.feature.IDF
+
+// ... continue from the previous example
+tf.cache()
+val idf = new IDF(minDocFreq = 2).fit(tf)
+val tfidf: RDD[Vector] = idf.transform(tf)
+{% endhighlight %}
+
+
</div>
</div>