aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/main
diff options
context:
space:
mode:
authorRJ Nowling <rnowling@gmail.com>2014-09-26 09:58:47 -0700
committerXiangrui Meng <meng@databricks.com>2014-09-26 09:58:47 -0700
commitec9df6a765701fa41390083df12e1dc1fee50662 (patch)
treef0c7743aa58693c8563dd9dee1ead54e36689436 /mllib/src/main
parentd16e161d744b27291fd2ee7e3578917ee14d83f9 (diff)
downloadspark-ec9df6a765701fa41390083df12e1dc1fee50662.tar.gz
spark-ec9df6a765701fa41390083df12e1dc1fee50662.tar.bz2
spark-ec9df6a765701fa41390083df12e1dc1fee50662.zip
[SPARK-3614][MLLIB] Add minimumOccurence filtering to IDF
This PR for [SPARK-3614](https://issues.apache.org/jira/browse/SPARK-3614) adds functionality for filtering out terms which do not appear in at least a minimum number of documents. This is implemented using a minimumOccurence parameter (default 0). When terms' document frequencies are less than minimumOccurence, their IDFs are set to 0, just like when the DF is 0. As a result, the TF-IDFs for the terms are found to be 0, as if the terms were not present in the documents. This PR makes the following changes: * Add a minimumOccurence parameter to the IDF and DocumentFrequencyAggregator classes. * Create a parameter-less constructor for IDF with a default minimumOccurence value of 0 to maintain backwards-compatibility with the original IDF API. * Set the IDFs to 0 for terms whose DFs are less than minimumOccurence * Add tests to the Spark IDFSuite and Java JavaTfIdfSuite test suites * Update the MLLib Feature Extraction programming guide to describe the new feature Author: RJ Nowling <rnowling@gmail.com> Closes #2494 from rnowling/spark-3614-idf-filter and squashes the following commits: 0aa3c63 [RJ Nowling] Fix identation e6523a8 [RJ Nowling] Remove unnecessary toDouble's from IDFSuite bfa82ec [RJ Nowling] Add space after if 30d20b3 [RJ Nowling] Add spaces around equals signs 9013447 [RJ Nowling] Add space before division operator 79978fc [RJ Nowling] Remove unnecessary semi-colon 40fd70c [RJ Nowling] Change minimumOccurence to minDocFreq in code and docs 47850ab [RJ Nowling] Changed minimumOccurence to Int from Long 9fb4093 [RJ Nowling] Remove unnecessary lines from IDF class docs 1fc09d8 [RJ Nowling] Add backwards-compatible constructor to DocumentFrequencyAggregator 1801fd2 [RJ Nowling] Fix style errors in IDF.scala 6897252 [RJ Nowling] Preface minimumOccurence members with val to make them final and immutable a200bab [RJ Nowling] Remove unnecessary else statement 4b974f5 [RJ Nowling] Remove accidentally-added import from testing c0cc643 [RJ Nowling] Add 
minimumOccurence filtering to IDF
Diffstat (limited to 'mllib/src/main')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala37
1 file changed, 33 insertions, 4 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index d40d5553c1..720bb70b08 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -30,9 +30,18 @@ import org.apache.spark.rdd.RDD
* Inverse document frequency (IDF).
* The standard formulation is used: `idf = log((m + 1) / (d(t) + 1))`, where `m` is the total
* number of documents and `d(t)` is the number of documents that contain term `t`.
+ *
+ * This implementation supports filtering out terms which do not appear in a minimum number
+ * of documents (controlled by the variable `minDocFreq`). For terms that are not in
+ * at least `minDocFreq` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
+ *
+ * @param minDocFreq minimum of documents in which a term
+ * should appear for filtering
*/
@Experimental
-class IDF {
+class IDF(val minDocFreq: Int) {
+
+ def this() = this(0)
// TODO: Allow different IDF formulations.
@@ -41,7 +50,8 @@ class IDF {
* @param dataset an RDD of term frequency vectors
*/
def fit(dataset: RDD[Vector]): IDFModel = {
- val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator)(
+ val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
+ minDocFreq = minDocFreq))(
seqOp = (df, v) => df.add(v),
combOp = (df1, df2) => df1.merge(df2)
).idf()
@@ -60,13 +70,16 @@ class IDF {
private object IDF {
/** Document frequency aggregator. */
- class DocumentFrequencyAggregator extends Serializable {
+ class DocumentFrequencyAggregator(val minDocFreq: Int) extends Serializable {
/** number of documents */
private var m = 0L
/** document frequency vector */
private var df: BDV[Long] = _
+
+ def this() = this(0)
+
/** Adds a new document. */
def add(doc: Vector): this.type = {
if (isEmpty) {
@@ -123,7 +136,18 @@ private object IDF {
val inv = new Array[Double](n)
var j = 0
while (j < n) {
- inv(j) = math.log((m + 1.0)/ (df(j) + 1.0))
+ /*
+ * If the term is not present in the minimum
+ * number of documents, set IDF to 0. This
+ * will cause multiplication in IDFModel to
+ * set TF-IDF to 0.
+ *
+ * Since arrays are initialized to 0 by default,
+ * we just omit changing those entries.
+ */
+ if(df(j) >= minDocFreq) {
+ inv(j) = math.log((m + 1.0) / (df(j) + 1.0))
+ }
j += 1
}
Vectors.dense(inv)
@@ -140,6 +164,11 @@ class IDFModel private[mllib] (val idf: Vector) extends Serializable {
/**
* Transforms term frequency (TF) vectors to TF-IDF vectors.
+ *
+ * If `minDocFreq` was set for the IDF calculation,
+ * the terms which occur in fewer than `minDocFreq`
+ * documents will have an entry of 0.
+ *
* @param dataset an RDD of term frequency vectors
* @return an RDD of TF-IDF vectors
*/