[SPARK-2852][MLLIB] API consistency for `mllib.feature`

This is part of SPARK-2828: 1. added a Java-friendly fit method to Word2Vec with tests; 2. changed DeveloperApi to Experimental for Normalizer & StandardScaler; 3. changed the default feature dimension to 2^20 in HashingTF. Author: Xiangrui Meng <meng@databricks.com>. Closes #1807 from mengxr/feature-api-check and squashes the following commits: 773c1a9 [Xiangrui Meng] change default numFeatures to 2^20 in HashingTF; change annotation from DeveloperApi to Experimental in Normalizer and StandardScaler. 883e122 [Xiangrui Meng] add @Experimental to Word2VecModel; add a Java-friendly method to Word2Vec.fit with tests.
author: Xiangrui Meng <meng@databricks.com> 2014-08-06 14:07:51 -0700
committer: Xiangrui Meng <meng@databricks.com> 2014-08-06 14:07:51 -0700
commit: 25cff1019da9d6cfc486a31d035b372ea5fbdfd2 (patch)
tree: 2336b0899ff33ad0187442c6098b089587f5047b /mllib/src/test/java/org
parent: 4e982364426c7d65032e8006c63ca4f9a0d40470 (diff)
download: spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.tar.gz
spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.tar.bz2
spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.zip
1 files changed, 66 insertions, 0 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java
new file mode 100644
index 0000000000..fb7afe8c64
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.feature;
+
+import java.io.Serializable;
+import java.util.List;
+
+import scala.Tuple2;
+
+import com.google.common.collect.Lists;
+import com.google.common.base.Strings;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+public class JavaWord2VecSuite implements Serializable { // JUnit tests for Word2Vec's Java API
+  private transient JavaSparkContext sc; // created in setUp(), stopped in tearDown()
+
+  @Before
+  public void setUp() {
+    sc = new JavaSparkContext("local", "JavaWord2VecSuite"); // new context before each test
+  }
+
+  @After
+  public void tearDown() {
+    sc.stop();
+    sc = null; // drop the reference to the stopped context
+  }
+
+  @Test
+  @SuppressWarnings("unchecked") // presumably for the generic varargs/array below; verify
+  public void word2Vec() {
+    // Java compatibility test; the corpus repeats "a b " 100 times and "a c " 10 times.
+    String sentence = Strings.repeat("a b ", 100) + Strings.repeat("a c ", 10);
+    List<String> words = Lists.newArrayList(sentence.split(" "));
+    List<List<String>> localDoc = Lists.newArrayList(words, words); // same word list twice
+    JavaRDD<List<String>> doc = sc.parallelize(localDoc);
+    Word2Vec word2vec = new Word2Vec()
+      .setVectorSize(10)
+      .setSeed(42L); // fixed seed
+    Word2VecModel model = word2vec.fit(doc); // the Java-friendly fit overload under test
+    Tuple2<String, Object>[] syms = model.findSynonyms("a", 2); // ask for 2 synonyms of "a"
+    Assert.assertEquals(2, syms.length);
+    Assert.assertEquals("b", syms[0]._1()); // expected order: "b" first, then "c"
+    Assert.assertEquals("c", syms[1]._1());
+  }
+}
author	Xiangrui Meng <meng@databricks.com>	2014-08-06 14:07:51 -0700
committer	Xiangrui Meng <meng@databricks.com>	2014-08-06 14:07:51 -0700
commit	25cff1019da9d6cfc486a31d035b372ea5fbdfd2 (patch)
tree	2336b0899ff33ad0187442c6098b089587f5047b /mllib/src/test/java/org
parent	4e982364426c7d65032e8006c63ca4f9a0d40470 (diff)
download	spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.tar.gz spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.tar.bz2 spark-25cff1019da9d6cfc486a31d035b372ea5fbdfd2.zip