diff options
author | Xiangrui Meng <meng@databricks.com> | 2014-08-06 14:07:51 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2014-08-06 14:07:51 -0700 |
commit | 25cff1019da9d6cfc486a31d035b372ea5fbdfd2 (patch) | |
tree | 2336b0899ff33ad0187442c6098b089587f5047b /mllib/src/test/java/org | |
parent | 4e982364426c7d65032e8006c63ca4f9a0d40470 (diff) | |
download | spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.tar.gz spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.tar.bz2 spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.zip |
[SPARK-2852][MLLIB] API consistency for `mllib.feature`
This is part of SPARK-2828:
1. added a Java-friendly fit method to Word2Vec with tests
2. changed DeveloperApi to Experimental for Normalizer & StandardScaler
3. changed the default feature dimension to 2^20 in HashingTF
Author: Xiangrui Meng <meng@databricks.com>
Closes #1807 from mengxr/feature-api-check and squashes the following commits:
773c1a9 [Xiangrui Meng] change default numFeatures to 2^20 in HashingTF; change annotation from DeveloperApi to Experimental in Normalizer and StandardScaler
883e122 [Xiangrui Meng] add @Experimental to Word2VecModel; add a Java-friendly method to Word2Vec.fit with tests
Diffstat (limited to 'mllib/src/test/java/org')
-rw-r--r-- | mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java | 66 |
1 file changed, 66 insertions, 0 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java new file mode 100644 index 0000000000..fb7afe8c64 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.feature; + +import java.io.Serializable; +import java.util.List; + +import scala.Tuple2; + +import com.google.common.collect.Lists; +import com.google.common.base.Strings; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +public class JavaWord2VecSuite implements Serializable { + private transient JavaSparkContext sc; + + @Before + public void setUp() { + sc = new JavaSparkContext("local", "JavaWord2VecSuite"); + } + + @After + public void tearDown() { + sc.stop(); + sc = null; + } + + @Test + @SuppressWarnings("unchecked") + public void word2Vec() { + // The tests are to check Java compatibility. 
+ String sentence = Strings.repeat("a b ", 100) + Strings.repeat("a c ", 10); + List<String> words = Lists.newArrayList(sentence.split(" ")); + List<List<String>> localDoc = Lists.newArrayList(words, words); + JavaRDD<List<String>> doc = sc.parallelize(localDoc); + Word2Vec word2vec = new Word2Vec() + .setVectorSize(10) + .setSeed(42L); + Word2VecModel model = word2vec.fit(doc); + Tuple2<String, Object>[] syms = model.findSynonyms("a", 2); + Assert.assertEquals(2, syms.length); + Assert.assertEquals("b", syms[0]._1()); + Assert.assertEquals("c", syms[1]._1()); + } +} |