[SPARK-3614][MLLIB] Add minimumOccurence filtering to IDF

This PR for [SPARK-3614](https://issues.apache.org/jira/browse/SPARK-3614) adds functionality for filtering out terms which do not appear in at least a minimum number of documents. This is implemented using a minimumOccurence parameter (default 0). When a term's document frequency (DF) is less than minimumOccurence, its IDF is set to 0, just as when the DF is 0. As a result, the TF-IDFs for such terms are 0, as if the terms were not present in the documents. This PR makes the following changes: * Add a minimumOccurence parameter to the IDF and DocumentFrequencyAggregator classes. * Create a parameter-less constructor for IDF with a default minimumOccurence value of 0 to maintain backwards compatibility with the original IDF API. * Set the IDFs to 0 for terms whose DFs are less than minimumOccurence * Add tests to the Spark IDFSuite and Java JavaTfIdfSuite test suites * Update the MLlib Feature Extraction programming guide to describe the new feature Author: RJ Nowling <rnowling@gmail.com> Closes #2494 from rnowling/spark-3614-idf-filter and squashes the following commits: 0aa3c63 [RJ Nowling] Fix identation e6523a8 [RJ Nowling] Remove unnecessary toDouble's from IDFSuite bfa82ec [RJ Nowling] Add space after if 30d20b3 [RJ Nowling] Add spaces around equals signs 9013447 [RJ Nowling] Add space before division operator 79978fc [RJ Nowling] Remove unnecessary semi-colon 40fd70c [RJ Nowling] Change minimumOccurence to minDocFreq in code and docs 47850ab [RJ Nowling] Changed minimumOccurence to Int from Long 9fb4093 [RJ Nowling] Remove unnecessary lines from IDF class docs 1fc09d8 [RJ Nowling] Add backwards-compatible constructor to DocumentFrequencyAggregator 1801fd2 [RJ Nowling] Fix style errors in IDF.scala 6897252 [RJ Nowling] Preface minimumOccurence members with val to make them final and immutable a200bab [RJ Nowling] Remove unnecessary else statement 4b974f5 [RJ Nowling] Remove accidentally-added import from testing c0cc643 [RJ Nowling] Add 
minimumOccurence filtering to IDF
author: RJ Nowling <rnowling@gmail.com> 2014-09-26 09:58:47 -0700
committer: Xiangrui Meng <meng@databricks.com> 2014-09-26 09:58:47 -0700
commit: ec9df6a765701fa41390083df12e1dc1fee50662 (patch)
tree: f0c7743aa58693c8563dd9dee1ead54e36689436 /mllib/src/test
parent: d16e161d744b27291fd2ee7e3578917ee14d83f9 (diff)
download: spark-ec9df6a765701fa41390083df12e1dc1fee50662.tar.gz
spark-ec9df6a765701fa41390083df12e1dc1fee50662.tar.bz2
spark-ec9df6a765701fa41390083df12e1dc1fee50662.zip
2 files changed, 55 insertions, 1 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
index e8d99f4ae4..064263e02c 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
@@ -63,4 +63,24 @@ public class JavaTfIdfSuite implements Serializable {
       Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
     }
   }
+
+  @Test
+  public void tfIdfMinimumDocumentFrequency() {  // fits and applies an IDF built with an explicit int argument
+    // The tests are to check Java compatibility.
+    HashingTF tf = new HashingTF();
+    JavaRDD<ArrayList<String>> documents = sc.parallelize(Lists.newArrayList(  // three tokenized sentences
+      Lists.newArrayList("this is a sentence".split(" ")),
+      Lists.newArrayList("this is another sentence".split(" ")),
+      Lists.newArrayList("this is still a sentence".split(" "))), 2);  // 2: presumably the partition count; confirm
+    JavaRDD<Vector> termFreqs = tf.transform(documents);
+    termFreqs.collect();  // result is discarded
+    IDF idf = new IDF(2);  // presumably minDocFreq = 2, the parameter name used in IDFSuite
+    JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
+    List<Vector> localTfIdfs = tfIdfs.collect();
+    int indexOfThis = tf.indexOf("this");  // "this" occurs in all three documents above
+    for (Vector v: localTfIdfs) {  // NOTE(review): IDFSuite's log((m + 1) / (x + 1)) is already 0 when x == m; confirm the filter is exercised
+      Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);  // weight of "this" must be 0 in every vector
+    }
+  }
+
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
index 53d9c0c640..43974f84e3 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -38,7 +38,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     val idf = new IDF
     val model = idf.fit(termFrequencies)
     val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
-      math.log((m.toDouble + 1.0) / (x + 1.0))
+      math.log((m + 1.0) / (x + 1.0))
     })
     assert(model.idf ~== expected absTol 1e-12)
     val tfidf = model.transform(termFrequencies).cache().zipWithIndex().map(_.swap).collectAsMap()
@@ -54,4 +54,38 @@ class IDFSuite extends FunSuite with LocalSparkContext {
     assert(tfidf2.indices === Array(1))
     assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
   }
+
+  test("idf minimum document frequency filtering") {  // checks fitted IDF weights and TF-IDF output with minDocFreq = 1
+    val n = 4  // vector size used for the sparse inputs below
+    val localTermFrequencies = Seq(  // three term-frequency vectors: sparse, dense, sparse
+      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
+      Vectors.dense(0.0, 1.0, 2.0, 3.0),
+      Vectors.sparse(n, Array(1), Array(1.0))
+    )
+    val m = localTermFrequencies.size  // number of input vectors (3)
+    val termFrequencies = sc.parallelize(localTermFrequencies, 2)  // 2: presumably the partition count; confirm
+    val idf = new IDF(minDocFreq = 1)
+    val model = idf.fit(termFrequencies)
+    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>  // x: how many input vectors have a nonzero entry at each index
+      if (x > 0) {  // count reaches the minDocFreq of 1
+        math.log((m + 1.0) / (x + 1.0))
+      } else {
+        0  // count below minDocFreq: expected weight is 0
+      }
+    })
+    assert(model.idf ~== expected absTol 1e-12)
+    val tfidf = model.transform(termFrequencies).cache().zipWithIndex().map(_.swap).collectAsMap()  // keyed by position index
+    assert(tfidf.size === 3)  // one output vector per input vector
+    val tfidf0 = tfidf(0L).asInstanceOf[SparseVector]  // first input was sparse; the cast expects a sparse result
+    assert(tfidf0.indices === Array(1, 3))
+    assert(Vectors.dense(tfidf0.values) ~==  // each input value scaled by the expected weight at its index
+      Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
+    val tfidf1 = tfidf(1L).asInstanceOf[DenseVector]  // second input was dense; the cast expects a dense result
+    assert(Vectors.dense(tfidf1.values) ~==
+      Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
+    val tfidf2 = tfidf(2L).asInstanceOf[SparseVector]  // third input was sparse with a single entry at index 1
+    assert(tfidf2.indices === Array(1))
+    assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
+  }
+
 }
author	RJ Nowling <rnowling@gmail.com>	2014-09-26 09:58:47 -0700
committer	Xiangrui Meng <meng@databricks.com>	2014-09-26 09:58:47 -0700
commit	ec9df6a765701fa41390083df12e1dc1fee50662 (patch)
tree	f0c7743aa58693c8563dd9dee1ead54e36689436 /mllib/src/test
parent	d16e161d744b27291fd2ee7e3578917ee14d83f9 (diff)
download	spark-ec9df6a765701fa41390083df12e1dc1fee50662.tar.gz spark-ec9df6a765701fa41390083df12e1dc1fee50662.tar.bz2 spark-ec9df6a765701fa41390083df12e1dc1fee50662.zip