author    Xiangrui Meng <meng@databricks.com>        2015-08-14 10:25:11 -0700
committer Joseph K. Bradley <joseph@databricks.com>  2015-08-14 10:25:11 -0700
commit    a0e1abbd010b9e73d472ce12ff1d987678005d32 (patch)
tree      c439c25caad95561001eacb76b08950051332ab4 /mllib
parent    c8677d73666850b37ff937520e538650632ce304 (diff)
[SPARK-9661] [MLLIB] minor clean-up of SPARK-9661

Some minor clean-ups after SPARK-9661. See my inline comments. MechCoder jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #8190 from mengxr/SPARK-9661-fix.
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala   |  5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala       |  6
-rw-r--r--  mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java | 40
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala   |  2

4 files changed, 28 insertions(+), 25 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index f31949f13a..82f05e4a18 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -674,10 +674,9 @@ class DistributedLDAModel private[clustering] (
}
/** Java-friendly version of [[topTopicsPerDocument]] */
- def javaTopTopicsPerDocument(
- k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] = {
+ def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = {
val topics = topTopicsPerDocument(k)
- topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[java.lang.Double])]].toJavaRDD()
+ topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD()
}
// TODO:
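The hunk above drops a needless element-level boxing: the document ID must stay boxed (java.lang.Long) because Java generics cannot hold primitives, but Array[Double] already erases to a primitive double[] on the JVM, so converting it to Array[java.lang.Double] only forced Java callers through a boxed array. A minimal Scala sketch of the wrapper pattern, with a hypothetical javaTopTopics standing in for DistributedLDAModel.javaTopTopicsPerDocument:

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD

object JavaFriendlyWrapperSketch {
  // Boxes only what Java generics require (the Long key); the primitive
  // arrays pass through untouched. The asInstanceOf cast is safe because
  // scala.Long and java.lang.Long share one boxed representation at runtime.
  def javaTopTopics(
      topics: RDD[(Long, Array[Int], Array[Double])])
    : JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = {
    topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD()
  }
}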
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 24fe48cb8f..ef8d786070 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -221,9 +221,7 @@ object Statistics {
def kolmogorovSmirnovTest(
data: JavaDoubleRDD,
distName: String,
- params: java.lang.Double*): KolmogorovSmirnovTestResult = {
- val javaParams = params.asInstanceOf[Seq[Double]]
- KolmogorovSmirnovTest.testOneSample(data.rdd.asInstanceOf[RDD[Double]],
- distName, javaParams: _*)
+ params: Double*): KolmogorovSmirnovTestResult = {
+ kolmogorovSmirnovTest(data.rdd.asInstanceOf[RDD[Double]], distName, params: _*)
}
}
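The rewrite above replaces a hand-rolled cast-and-call with a straight delegation to the Scala overload, so the two entry points cannot drift apart. A minimal sketch of the pattern, with ksTest as a hypothetical stand-in for Statistics.kolmogorovSmirnovTest:

import org.apache.spark.api.java.JavaDoubleRDD
import org.apache.spark.rdd.RDD

object KSTestDelegationSketch {
  // Stand-in for the Scala overload; the real method runs a one-sample
  // Kolmogorov-Smirnov test and returns a test result.
  def ksTest(data: RDD[Double], distName: String, params: Double*): Unit = ()

  // Java-friendly overload: JavaDoubleRDD wraps an RDD of boxed doubles,
  // and the cast works because boxed and primitive doubles coincide at
  // runtime inside the RDD, exactly as in the hunk above.
  def ksTest(data: JavaDoubleRDD, distName: String, params: Double*): Unit =
    ksTest(data.rdd.asInstanceOf[RDD[Double]], distName, params: _*)
}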
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index 427be9430d..6e91cde2ea 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -22,12 +22,14 @@ import java.util.ArrayList;
import java.util.Arrays;
import scala.Tuple2;
+import scala.Tuple3;
import org.junit.After;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertArrayEquals;
import org.junit.Before;
import org.junit.Test;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.JavaPairRDD;
@@ -44,9 +46,9 @@ public class JavaLDASuite implements Serializable {
public void setUp() {
sc = new JavaSparkContext("local", "JavaLDA");
ArrayList<Tuple2<Long, Vector>> tinyCorpus = new ArrayList<Tuple2<Long, Vector>>();
- for (int i = 0; i < LDASuite$.MODULE$.tinyCorpus().length; i++) {
- tinyCorpus.add(new Tuple2<Long, Vector>((Long)LDASuite$.MODULE$.tinyCorpus()[i]._1(),
- LDASuite$.MODULE$.tinyCorpus()[i]._2()));
+ for (int i = 0; i < LDASuite.tinyCorpus().length; i++) {
+ tinyCorpus.add(new Tuple2<Long, Vector>((Long)LDASuite.tinyCorpus()[i]._1(),
+ LDASuite.tinyCorpus()[i]._2()));
}
JavaRDD<Tuple2<Long, Vector>> tmpCorpus = sc.parallelize(tinyCorpus, 2);
corpus = JavaPairRDD.fromJavaRDD(tmpCorpus);
@@ -60,7 +62,7 @@ public class JavaLDASuite implements Serializable {
@Test
public void localLDAModel() {
- Matrix topics = LDASuite$.MODULE$.tinyTopics();
+ Matrix topics = LDASuite.tinyTopics();
double[] topicConcentration = new double[topics.numRows()];
Arrays.fill(topicConcentration, 1.0D / topics.numRows());
LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1D, 100D);
@@ -110,8 +112,8 @@ public class JavaLDASuite implements Serializable {
assertEquals(roundedLocalTopicSummary.length, k);
// Check: log probabilities
- assert(model.logLikelihood() < 0.0);
- assert(model.logPrior() < 0.0);
+ assertTrue(model.logLikelihood() < 0.0);
+ assertTrue(model.logPrior() < 0.0);
// Check: topic distributions
JavaPairRDD<Long, Vector> topicDistributions = model.javaTopicDistributions();
@@ -126,8 +128,12 @@ public class JavaLDASuite implements Serializable {
assertEquals(topicDistributions.count(), nonEmptyCorpus.count());
// Check: javaTopTopicsPerDocuments
- JavaRDD<scala.Tuple3<java.lang.Long, int[], java.lang.Double[]>> topTopics =
- model.javaTopTopicsPerDocument(3);
+ Tuple3<Long, int[], double[]> topTopics = model.javaTopTopicsPerDocument(3).first();
+ Long docId = topTopics._1(); // confirm doc ID type
+ int[] topicIndices = topTopics._2();
+ double[] topicWeights = topTopics._3();
+ assertEquals(3, topicIndices.length);
+ assertEquals(3, topicWeights.length);
}
@Test
@@ -177,18 +183,18 @@ public class JavaLDASuite implements Serializable {
// check: logLikelihood.
ArrayList<Tuple2<Long, Vector>> docsSingleWord = new ArrayList<Tuple2<Long, Vector>>();
- docsSingleWord.add(new Tuple2<Long, Vector>(Long.valueOf(0), Vectors.dense(1.0, 0.0, 0.0)));
+ docsSingleWord.add(new Tuple2<Long, Vector>(0L, Vectors.dense(1.0, 0.0, 0.0)));
JavaPairRDD<Long, Vector> single = JavaPairRDD.fromJavaRDD(sc.parallelize(docsSingleWord));
double logLikelihood = toyModel.logLikelihood(single);
}
- private static int tinyK = LDASuite$.MODULE$.tinyK();
- private static int tinyVocabSize = LDASuite$.MODULE$.tinyVocabSize();
- private static Matrix tinyTopics = LDASuite$.MODULE$.tinyTopics();
+ private static int tinyK = LDASuite.tinyK();
+ private static int tinyVocabSize = LDASuite.tinyVocabSize();
+ private static Matrix tinyTopics = LDASuite.tinyTopics();
private static Tuple2<int[], double[]>[] tinyTopicDescription =
- LDASuite$.MODULE$.tinyTopicDescription();
+ LDASuite.tinyTopicDescription();
private JavaPairRDD<Long, Vector> corpus;
- private LocalLDAModel toyModel = LDASuite$.MODULE$.toyModel();
- private ArrayList<Tuple2<Long, Vector>> toyData = LDASuite$.MODULE$.javaToyData();
+ private LocalLDAModel toyModel = LDASuite.toyModel();
+ private ArrayList<Tuple2<Long, Vector>> toyData = LDASuite.javaToyData();
}
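The LDASuite$.MODULE$ cleanups above lean on the static forwarders the Scala compiler generates: for an object with no conflicting companion class, each method also appears as a static method on a class of the same name, so Java tests can write LDASuite.tinyK() instead of reaching through the singleton instance. (The assert-to-assertTrue switch is a real fix as well: Java's assert statement is silently skipped unless the JVM runs with -ea, whereas JUnit's assertTrue always evaluates.) A minimal sketch of the forwarder mechanics, using a hypothetical TinySuite object:

object TinySuite {
  // Compiles to a singleton class TinySuite$ with a MODULE$ instance, plus
  // a static forwarder tinyK() on the mirror class TinySuite.
  def tinyK(): Int = 3
}

// Both Java call forms below resolve to the same singleton method:
//   TinySuite$.MODULE$.tinyK();  // explicit singleton access
//   TinySuite.tinyK();           // generated static forwarder, cleaner in Java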
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 926185e90b..99e28499fd 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -581,7 +581,7 @@ private[clustering] object LDASuite {
def javaToyData: JArrayList[(java.lang.Long, Vector)] = {
val javaData = new JArrayList[(java.lang.Long, Vector)]
var i = 0
- while (i < toyData.size) {
+ while (i < toyData.length) {
javaData.add((toyData(i)._1, toyData(i)._2))
i += 1
}
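One note on this final hunk: on an Array, length is a direct read of the JVM array's length field, while size is only reachable through the implicit ArrayOps conversion, which can allocate a wrapper object on each call in the Scala versions of this era, so length is the better choice inside a hot while loop. A minimal sketch of the loop shape, with a hypothetical toyData array:

object ArrayLengthSketch {
  def countDocs(toyData: Array[(Long, Double)]): Int = {
    var i = 0
    while (i < toyData.length) {  // direct field read; no ArrayOps wrapper
      i += 1
    }
    i
  }
}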