diff options
Diffstat (limited to 'mllib')
7 files changed, 42 insertions, 79 deletions
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java index 8c0338e284..683ceffeae 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java @@ -21,16 +21,14 @@ import java.io.Serializable; import java.util.Arrays; import java.util.List; -import scala.Tuple2; - import org.junit.Assert; import org.junit.Test; import org.apache.spark.SharedSparkSession; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.ml.linalg.Vectors; +import org.apache.spark.mllib.linalg.DenseVector; import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.mllib.linalg.distributed.RowMatrix; import org.apache.spark.sql.Dataset; @@ -69,35 +67,22 @@ public class JavaPCASuite extends SharedSparkSession { JavaRDD<Vector> dataRDD = jsc.parallelize(points, 2); RowMatrix mat = new RowMatrix(dataRDD.map( - new Function<Vector, org.apache.spark.mllib.linalg.Vector>() { - public org.apache.spark.mllib.linalg.Vector call(Vector vector) { - return new org.apache.spark.mllib.linalg.DenseVector(vector.toArray()); - } - } + (Vector vector) -> (org.apache.spark.mllib.linalg.Vector) new DenseVector(vector.toArray()) ).rdd()); Matrix pc = mat.computePrincipalComponents(3); mat.multiply(pc).rows().toJavaRDD(); - JavaRDD<Vector> expected = mat.multiply(pc).rows().toJavaRDD().map( - new Function<org.apache.spark.mllib.linalg.Vector, Vector>() { - public Vector call(org.apache.spark.mllib.linalg.Vector vector) { - return vector.asML(); - } - } - ); + JavaRDD<Vector> expected = mat.multiply(pc).rows().toJavaRDD() + .map(org.apache.spark.mllib.linalg.Vector::asML); - JavaRDD<VectorPair> featuresExpected = dataRDD.zip(expected).map( - new Function<Tuple2<Vector, Vector>, VectorPair>() { - public VectorPair call(Tuple2<Vector, Vector> pair) { - VectorPair featuresExpected = new VectorPair(); - featuresExpected.setFeatures(pair._1()); - featuresExpected.setExpected(pair._2()); - return featuresExpected; - } - } - ); + JavaRDD<VectorPair> featuresExpected = dataRDD.zip(expected).map(pair -> { + VectorPair featuresExpected1 = new VectorPair(); + featuresExpected1.setFeatures(pair._1()); + featuresExpected1.setExpected(pair._2()); + return featuresExpected1; + }); Dataset<Row> df = spark.createDataFrame(featuresExpected, VectorPair.class); PCAModel pca = new PCA() diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java index 6ded42e928..65db3d014f 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java @@ -25,7 +25,6 @@ import org.junit.Test; import org.apache.spark.SharedSparkSession; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; @@ -42,7 +41,7 @@ public class JavaNaiveBayesSuite extends SharedSparkSession { new LabeledPoint(2, Vectors.dense(0.0, 0.0, 2.0)) ); - private int validatePrediction(List<LabeledPoint> points, NaiveBayesModel model) { + private static int validatePrediction(List<LabeledPoint> points, NaiveBayesModel model) { int correct = 0; for (LabeledPoint p : points) { if (model.predict(p.features()) == p.label()) { @@ -80,12 +79,7 @@ public class JavaNaiveBayesSuite extends SharedSparkSession { public void testPredictJavaRDD() { JavaRDD<LabeledPoint> examples = jsc.parallelize(POINTS, 2).cache(); NaiveBayesModel model = NaiveBayes.train(examples.rdd()); - JavaRDD<Vector> vectors = examples.map(new Function<LabeledPoint, Vector>() { - @Override - public Vector call(LabeledPoint v) throws Exception { - return v.features(); - } - }); + JavaRDD<Vector> vectors = examples.map(LabeledPoint::features); JavaRDD<Double> predictions = model.predict(vectors); // Should be able to get the first prediction. predictions.first(); diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java index 3d62b273d2..b4196c6ecd 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java @@ -17,7 +17,7 @@ package org.apache.spark.mllib.clustering; -import com.google.common.collect.Lists; +import java.util.Arrays; import org.junit.Assert; import org.junit.Test; @@ -31,7 +31,7 @@ public class JavaBisectingKMeansSuite extends SharedSparkSession { @Test public void twoDimensionalData() { - JavaRDD<Vector> points = jsc.parallelize(Lists.newArrayList( + JavaRDD<Vector> points = jsc.parallelize(Arrays.asList( Vectors.dense(4, -1), Vectors.dense(4, 1), Vectors.sparse(2, new int[]{0}, new double[]{1.0}) diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java index 08d6713ab2..38ee2507f2 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java @@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import scala.Tuple2; import scala.Tuple3; @@ -30,7 +31,6 @@ import static org.junit.Assert.*; import org.apache.spark.SharedSparkSession; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.linalg.Matrix; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; @@ -39,7 +39,7 @@ public class JavaLDASuite extends SharedSparkSession { @Override public void setUp() throws IOException { super.setUp(); - ArrayList<Tuple2<Long, Vector>> tinyCorpus = new ArrayList<>(); + List<Tuple2<Long, Vector>> tinyCorpus = new ArrayList<>(); for (int i = 0; i < LDASuite.tinyCorpus().length; i++) { tinyCorpus.add(new Tuple2<>((Long) LDASuite.tinyCorpus()[i]._1(), LDASuite.tinyCorpus()[i]._2())); @@ -53,7 +53,7 @@ public class JavaLDASuite extends SharedSparkSession { Matrix topics = LDASuite.tinyTopics(); double[] topicConcentration = new double[topics.numRows()]; Arrays.fill(topicConcentration, 1.0D / topics.numRows()); - LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1D, 100D); + LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1.0, 100.0); // Check: basic parameters assertEquals(model.k(), tinyK); @@ -87,17 +87,17 @@ public class JavaLDASuite extends SharedSparkSession { // Check: basic parameters LocalLDAModel localModel = model.toLocal(); - assertEquals(model.k(), k); - assertEquals(localModel.k(), k); - assertEquals(model.vocabSize(), tinyVocabSize); - assertEquals(localModel.vocabSize(), tinyVocabSize); - assertEquals(model.topicsMatrix(), localModel.topicsMatrix()); + assertEquals(k, model.k()); + assertEquals(k, localModel.k()); + assertEquals(tinyVocabSize, model.vocabSize()); + assertEquals(tinyVocabSize, localModel.vocabSize()); + assertEquals(localModel.topicsMatrix(), model.topicsMatrix()); // Check: topic summaries Tuple2<int[], double[]>[] roundedTopicSummary = model.describeTopics(); - assertEquals(roundedTopicSummary.length, k); + assertEquals(k, roundedTopicSummary.length); Tuple2<int[], double[]>[] roundedLocalTopicSummary = localModel.describeTopics(); - assertEquals(roundedLocalTopicSummary.length, k); + assertEquals(k, roundedLocalTopicSummary.length); // Check: log probabilities assertTrue(model.logLikelihood() < 0.0); @@ -107,12 +107,8 @@ public class JavaLDASuite extends SharedSparkSession { JavaPairRDD<Long, Vector> topicDistributions = model.javaTopicDistributions(); // SPARK-5562. since the topicDistribution returns the distribution of the non empty docs // over topics. Compare it against nonEmptyCorpus instead of corpus - JavaPairRDD<Long, Vector> nonEmptyCorpus = corpus.filter( - new Function<Tuple2<Long, Vector>, Boolean>() { - public Boolean call(Tuple2<Long, Vector> tuple2) { - return Vectors.norm(tuple2._2(), 1.0) != 0.0; - } - }); + JavaPairRDD<Long, Vector> nonEmptyCorpus = + corpus.filter(tuple2 -> Vectors.norm(tuple2._2(), 1.0) != 0.0); assertEquals(topicDistributions.count(), nonEmptyCorpus.count()); // Check: javaTopTopicsPerDocuments @@ -155,14 +151,14 @@ public class JavaLDASuite extends SharedSparkSession { LDAModel model = lda.run(corpus); // Check: basic parameters - assertEquals(model.k(), k); - assertEquals(model.vocabSize(), tinyVocabSize); + assertEquals(k, model.k()); + assertEquals(tinyVocabSize, model.vocabSize()); // Check: topic summaries Tuple2<int[], double[]>[] roundedTopicSummary = model.describeTopics(); - assertEquals(roundedTopicSummary.length, k); + assertEquals(k, roundedTopicSummary.length); Tuple2<int[], double[]>[] roundedLocalTopicSummary = model.describeTopics(); - assertEquals(roundedLocalTopicSummary.length, k); + assertEquals(k, roundedLocalTopicSummary.length); } @Test @@ -177,7 +173,7 @@ public class JavaLDASuite extends SharedSparkSession { double logPerplexity = toyModel.logPerplexity(pairedDocs); // check: logLikelihood. - ArrayList<Tuple2<Long, Vector>> docsSingleWord = new ArrayList<>(); + List<Tuple2<Long, Vector>> docsSingleWord = new ArrayList<>(); docsSingleWord.add(new Tuple2<>(0L, Vectors.dense(1.0, 0.0, 0.0))); JavaPairRDD<Long, Vector> single = JavaPairRDD.fromJavaRDD(jsc.parallelize(docsSingleWord)); double logLikelihood = toyModel.logLikelihood(single); @@ -190,6 +186,6 @@ public class JavaLDASuite extends SharedSparkSession { LDASuite.tinyTopicDescription(); private JavaPairRDD<Long, Vector> corpus; private LocalLDAModel toyModel = LDASuite.toyModel(); - private ArrayList<Tuple2<Long, Vector>> toyData = LDASuite.javaToyData(); + private List<Tuple2<Long, Vector>> toyData = LDASuite.javaToyData(); } diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java index 3451e07737..15de566c88 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java @@ -31,9 +31,9 @@ public class JavaAssociationRulesSuite extends SharedSparkSession { @SuppressWarnings("unchecked") JavaRDD<FPGrowth.FreqItemset<String>> freqItemsets = jsc.parallelize(Arrays.asList( - new FreqItemset<String>(new String[]{"a"}, 15L), - new FreqItemset<String>(new String[]{"b"}, 35L), - new FreqItemset<String>(new String[]{"a", "b"}, 12L) + new FreqItemset<>(new String[]{"a"}, 15L), + new FreqItemset<>(new String[]{"b"}, 35L), + new FreqItemset<>(new String[]{"a", "b"}, 12L) )); JavaRDD<AssociationRules.Rule<String>> results = (new AssociationRules()).run(freqItemsets); diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java index a46b1321b3..86c723aa00 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java @@ -24,13 +24,13 @@ import org.junit.Test; import org.apache.spark.SharedSparkSession; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.util.LinearDataGenerator; public class JavaLinearRegressionSuite extends SharedSparkSession { - int validatePrediction(List<LabeledPoint> validationData, LinearRegressionModel model) { + private static int validatePrediction( + List<LabeledPoint> validationData, LinearRegressionModel model) { int numAccurate = 0; for (LabeledPoint point : validationData) { Double prediction = model.predict(point.features()); @@ -87,12 +87,7 @@ public class JavaLinearRegressionSuite extends SharedSparkSession { LinearDataGenerator.generateLinearInputAsList(A, weights, nPoints, 42, 0.1), 2).cache(); LinearRegressionWithSGD linSGDImpl = new LinearRegressionWithSGD(); LinearRegressionModel model = linSGDImpl.run(testRDD.rdd()); - JavaRDD<Vector> vectors = testRDD.map(new Function<LabeledPoint, Vector>() { - @Override - public Vector call(LabeledPoint v) throws Exception { - return v.features(); - } - }); + JavaRDD<Vector> vectors = testRDD.map(LabeledPoint::features); JavaRDD<Double> predictions = model.predict(vectors); // Should be able to get the first prediction. predictions.first(); diff --git a/mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java b/mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java index 1dcbbcaa02..0f71deb9ea 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java @@ -25,8 +25,6 @@ import org.junit.Test; import org.apache.spark.SharedSparkSession; import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.tree.configuration.Algo; import org.apache.spark.mllib.tree.configuration.Strategy; @@ -35,7 +33,7 @@ import org.apache.spark.mllib.tree.model.DecisionTreeModel; public class JavaDecisionTreeSuite extends SharedSparkSession { - int validatePrediction(List<LabeledPoint> validationData, DecisionTreeModel model) { + private static int validatePrediction(List<LabeledPoint> validationData, DecisionTreeModel model) { int numCorrect = 0; for (LabeledPoint point : validationData) { Double prediction = model.predict(point.features()); @@ -63,7 +61,7 @@ public class JavaDecisionTreeSuite extends SharedSparkSession { DecisionTreeModel model = learner.run(rdd.rdd()); int numCorrect = validatePrediction(arr, model); - Assert.assertTrue(numCorrect == rdd.count()); + Assert.assertEquals(numCorrect, rdd.count()); } @Test @@ -82,15 +80,10 @@ public class JavaDecisionTreeSuite extends SharedSparkSession { DecisionTreeModel model = DecisionTree$.MODULE$.train(rdd.rdd(), strategy); // java compatibility test - JavaRDD<Double> predictions = model.predict(rdd.map(new Function<LabeledPoint, Vector>() { - @Override - public Vector call(LabeledPoint v1) { - return v1.features(); - } - })); + JavaRDD<Double> predictions = model.predict(rdd.map(LabeledPoint::features)); int numCorrect = validatePrediction(arr, model); - Assert.assertTrue(numCorrect == rdd.count()); + Assert.assertEquals(numCorrect, rdd.count()); } } |