about | summary | refs | log | tree | commit | diff
path: root/mllib/src/test/java/org
diff options
context:
space:
mode:
author Xiangrui Meng <meng@databricks.com> 2014-07-31 12:55:00 -0700
committer Xiangrui Meng <meng@databricks.com> 2014-07-31 12:55:00 -0700
commit dc0865bc7e119fe507061c27069c17523b87dfea (patch)
tree 481dfc65f65273dda1fbfae7e22c780aee7f7168 /mllib/src/test/java/org
parent e5749a1342327263dc6b94ba470e392fbea703fa (diff)
download spark-dc0865bc7e119fe507061c27069c17523b87dfea.tar.gz
spark-dc0865bc7e119fe507061c27069c17523b87dfea.tar.bz2
spark-dc0865bc7e119fe507061c27069c17523b87dfea.zip
[SPARK-2511][MLLIB] add HashingTF and IDF
This is roughly the TF-IDF implementation used in the Databricks Cloud Demo: http://databricks.com/cloud/ .
Both `HashingTF` and `IDF` are implemented as transformers, similar to scikit-learn.

Author: Xiangrui Meng <meng@databricks.com>

Closes #1671 from mengxr/tfidf and squashes the following commits:

7d65888 [Xiangrui Meng] use JavaConverters._
5fe9ec4 [Xiangrui Meng] fix unit test
6e214ec [Xiangrui Meng] add apache header
cfd9aed [Xiangrui Meng] add Java-friendly methods move classes to mllib.feature
3814440 [Xiangrui Meng] add HashingTF and IDF
Diffstat (limited to 'mllib/src/test/java/org')
-rw-r--r-- mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java 66
1 files changed, 66 insertions, 0 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
new file mode 100644
index 0000000000..e8d99f4ae4
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Vector;
+
+public class JavaTfIdfSuite implements Serializable { // JUnit suite driving HashingTF and IDF from Java
+ private transient JavaSparkContext sc; // transient: not written out if this Serializable suite is serialized
+
+ @Before
+ public void setUp() {
+ sc = new JavaSparkContext("local", "JavaTfIdfSuite"); // new context before every test
+ }
+
+ @After
+ public void tearDown() {
+ sc.stop(); // stop the context created in setUp()
+ sc = null; // drop the reference so a stopped context is never reused
+ }
+
+ @Test
+ public void tfIdf() {
+ // Java-compatibility smoke test: drives HashingTF and IDF through their JavaRDD-based methods.
+ HashingTF tf = new HashingTF();
+ JavaRDD<ArrayList<String>> documents = sc.parallelize(Lists.newArrayList( // three tokenized documents
+ Lists.newArrayList("this is a sentence".split(" ")),
+ Lists.newArrayList("this is another sentence".split(" ")),
+ Lists.newArrayList("this is still a sentence".split(" "))), 2); // 2: presumably the partition count - confirm
+ JavaRDD<Vector> termFreqs = tf.transform(documents); // one term-frequency vector per document
+ termFreqs.collect(); // result discarded; presumably only checks that collect() works from Java
+ IDF idf = new IDF();
+ JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs); // fit and transform on the same term frequencies
+ List<Vector> localTfIdfs = tfIdfs.collect();
+ int indexOfThis = tf.indexOf("this"); // index that HashingTF reports for the term "this"
+ for (Vector v: localTfIdfs) {
+ Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15); // "this" is in all three documents, so weight 0 is expected
+ }
+ }
+}