diff options
author | Xiangrui Meng <meng@databricks.com> | 2014-07-31 12:55:00 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2014-07-31 12:55:00 -0700 |
commit | dc0865bc7e119fe507061c27069c17523b87dfea (patch) | |
tree | 481dfc65f65273dda1fbfae7e22c780aee7f7168 /mllib/src/test/java | |
parent | e5749a1342327263dc6b94ba470e392fbea703fa (diff) | |
download | spark-dc0865bc7e119fe507061c27069c17523b87dfea.tar.gz spark-dc0865bc7e119fe507061c27069c17523b87dfea.tar.bz2 spark-dc0865bc7e119fe507061c27069c17523b87dfea.zip |
[SPARK-2511][MLLIB] add HashingTF and IDF
This is roughly the TF-IDF implementation used in the Databricks Cloud Demo: http://databricks.com/cloud/ .
Both `HashingTF` and `IDF` are implemented as transformers, similar to scikit-learn.
Author: Xiangrui Meng <meng@databricks.com>
Closes #1671 from mengxr/tfidf and squashes the following commits:
7d65888 [Xiangrui Meng] use JavaConverters._
5fe9ec4 [Xiangrui Meng] fix unit test
6e214ec [Xiangrui Meng] add apache header
cfd9aed [Xiangrui Meng] add Java-friendly methods move classes to mllib.feature
3814440 [Xiangrui Meng] add HashingTF and IDF
Diffstat (limited to 'mllib/src/test/java')
-rw-r--r-- | mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java new file mode 100644 index 0000000000..e8d99f4ae4 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaTfIdfSuite.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.feature; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import com.google.common.collect.Lists; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.linalg.Vector; + +public class JavaTfIdfSuite implements Serializable { + private transient JavaSparkContext sc; + + @Before + public void setUp() { + sc = new JavaSparkContext("local", "JavaTfIdfSuite"); + } + + @After + public void tearDown() { + sc.stop(); + sc = null; + } + + @Test + public void tfIdf() { + // The tests are to check Java compatibility. + HashingTF tf = new HashingTF(); + JavaRDD<ArrayList<String>> documents = sc.parallelize(Lists.newArrayList( + Lists.newArrayList("this is a sentence".split(" ")), + Lists.newArrayList("this is another sentence".split(" ")), + Lists.newArrayList("this is still a sentence".split(" "))), 2); + JavaRDD<Vector> termFreqs = tf.transform(documents); + termFreqs.collect(); + IDF idf = new IDF(); + JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs); + List<Vector> localTfIdfs = tfIdfs.collect(); + int indexOfThis = tf.indexOf("this"); + for (Vector v: localTfIdfs) { + Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15); + } + } +} |