[SPARK-2511][MLLIB] add HashingTF and IDF

This is roughly the TF-IDF implementation used in the Databricks Cloud Demo: http://databricks.com/cloud/ . Both `HashingTF` and `IDF` are implemented as transformers, similar to scikit-learn. Author: Xiangrui Meng <meng@databricks.com> Closes #1671 from mengxr/tfidf and squashes the following commits: 7d65888 [Xiangrui Meng] use JavaConverters._ 5fe9ec4 [Xiangrui Meng] fix unit test 6e214ec [Xiangrui Meng] add apache header cfd9aed [Xiangrui Meng] add Java-friendly methods move classes to mllib.feature 3814440 [Xiangrui Meng] add HashingTF and IDF
author: Xiangrui Meng <meng@databricks.com> 2014-07-31 12:55:00 -0700
committer: Xiangrui Meng <meng@databricks.com> 2014-07-31 12:55:00 -0700
commit: dc0865bc7e119fe507061c27069c17523b87dfea (patch)
tree: 481dfc65f65273dda1fbfae7e22c780aee7f7168 /mllib/src/test
parent: e5749a1342327263dc6b94ba470e392fbea703fa (diff)
download: spark-dc0865bc7e119fe507061c27069c17523b87dfea.tar.gz
spark-dc0865bc7e119fe507061c27069c17523b87dfea.tar.bz2
spark-dc0865bc7e119fe507061c27069c17523b87dfea.zip
3 files changed, 181 insertions, 0 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
new file mode 100644
index 0000000000..e8d99f4ae4
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+
+public class JavaTfIdfSuite implements Serializable {
+  private transient JavaSparkContext sc;
+
+  @Before
+  public void setUp() {
+    sc = new JavaSparkContext("local", "JavaTfIdfSuite");
+  }
+
+  @After
+  public void tearDown() {
+    sc.stop();
+    sc = null;
+  }
+
+  @Test
+  public void tfIdf() {
+    // The tests are to check Java compatibility.
+    HashingTF tf = new HashingTF();
+    JavaRDD<ArrayList<String>> documents = sc.parallelize(Lists.newArrayList(
+      Lists.newArrayList("this is a sentence".split(" ")),
+      Lists.newArrayList("this is another sentence".split(" ")),
+      Lists.newArrayList("this is still a sentence".split(" "))), 2);
+    JavaRDD<Vector> termFreqs = tf.transform(documents);
+    termFreqs.collect();
+    IDF idf = new IDF();
+    JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
+    List<Vector> localTfIdfs = tfIdfs.collect();
+    int indexOfThis = tf.indexOf("this");
+    for (Vector v: localTfIdfs) {
+      Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
+    }
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/HashingTFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/HashingTFSuite.scala
new file mode 100644
index 0000000000..a599e0d938
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/HashingTFSuite.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.LocalSparkContext
+
+class HashingTFSuite extends FunSuite with LocalSparkContext {
+
+  test("hashing tf on a single doc") {
+    val hashingTF = new HashingTF(1000)
+    val doc = "a a b b c d".split(" ")
+    val n = hashingTF.numFeatures
+    val termFreqs = Seq(
+      (hashingTF.indexOf("a"), 2.0),
+      (hashingTF.indexOf("b"), 2.0),
+      (hashingTF.indexOf("c"), 1.0),
+      (hashingTF.indexOf("d"), 1.0))
+    assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n),
+      "index must be in range [0, #features)")
+    assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing")
+    val expected = Vectors.sparse(n, termFreqs)
+    assert(hashingTF.transform(doc) === expected)
+  }
+
+  test("hashing tf on an RDD") {
+    val hashingTF = new HashingTF
+    val localDocs: Seq[Seq[String]] = Seq(
+      "a a b b b c d".split(" "),
+      "a b c d a b c".split(" "),
+      "c b a c b a a".split(" "))
+    val docs = sc.parallelize(localDocs, 2)
+    assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
new file mode 100644
index 0000000000..78a2804ff2
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/IDFSuite.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.SparkContext._
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors}
+import org.apache.spark.mllib.util.LocalSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+class IDFSuite extends FunSuite with LocalSparkContext {
+
+  test("idf") {
+    val n = 4
+    val localTermFrequencies = Seq(
+      Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
+      Vectors.dense(0.0, 1.0, 2.0, 3.0),
+      Vectors.sparse(n, Array(1), Array(1.0))
+    )
+    val m = localTermFrequencies.size
+    val termFrequencies = sc.parallelize(localTermFrequencies, 2)
+    val idf = new IDF
+    intercept[IllegalStateException] {
+      idf.idf()
+    }
+    intercept[IllegalStateException] {
+      idf.transform(termFrequencies)
+    }
+    idf.fit(termFrequencies)
+    val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
+      math.log((m.toDouble + 1.0) / (x + 1.0))
+    })
+    assert(idf.idf() ~== expected absTol 1e-12)
+    val tfidf = idf.transform(termFrequencies).cache().zipWithIndex().map(_.swap).collectAsMap()
+    assert(tfidf.size === 3)
+    val tfidf0 = tfidf(0L).asInstanceOf[SparseVector]
+    assert(tfidf0.indices === Array(1, 3))
+    assert(Vectors.dense(tfidf0.values) ~==
+      Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12)
+    val tfidf1 = tfidf(1L).asInstanceOf[DenseVector]
+    assert(Vectors.dense(tfidf1.values) ~==
+      Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12)
+    val tfidf2 = tfidf(2L).asInstanceOf[SparseVector]
+    assert(tfidf2.indices === Array(1))
+    assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
+  }
+}
author	Xiangrui Meng <meng@databricks.com>	2014-07-31 12:55:00 -0700
committer	Xiangrui Meng <meng@databricks.com>	2014-07-31 12:55:00 -0700
commit	dc0865bc7e119fe507061c27069c17523b87dfea (patch)
tree	481dfc65f65273dda1fbfae7e22c780aee7f7168 /mllib/src/test
parent	e5749a1342327263dc6b94ba470e392fbea703fa (diff)
download	spark-dc0865bc7e119fe507061c27069c17523b87dfea.tar.gz spark-dc0865bc7e119fe507061c27069c17523b87dfea.tar.bz2 spark-dc0865bc7e119fe507061c27069c17523b87dfea.zip