author    Sean Owen <sowen@cloudera.com>    2017-02-19 09:42:50 -0800
committer Sean Owen <sowen@cloudera.com>    2017-02-19 09:42:50 -0800
commit   1487c9af20a333ead55955acf4c0aa323bea0d07 (patch)
tree     5f47daa77e0f73da1e009cc3dcf0a5c0073246aa /mllib
parent   de14d35f77071932963a994fac5aec0e5df838a1 (diff)
download spark-1487c9af20a333ead55955acf4c0aa323bea0d07.tar.gz
         spark-1487c9af20a333ead55955acf4c0aa323bea0d07.tar.bz2
         spark-1487c9af20a333ead55955acf4c0aa323bea0d07.zip
[SPARK-19534][TESTS] Convert Java tests to use lambdas, Java 8 features
## What changes were proposed in this pull request?

Convert tests to use Java 8 lambdas, and modest related fixes to surrounding code.

## How was this patch tested?

Jenkins tests

Author: Sean Owen <sowen@cloudera.com>

Closes #16964 from srowen/SPARK-19534.
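The core mechanical change, repeated throughout the hunks below, is replacing one-method anonymous implementations of Spark's single-abstract-method Function interface with lambdas or method references. A minimal before/after sketch of the pattern (the wrapper class is illustrative; JavaRDD, Function, and LabeledPoint are Spark's own APIs):

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;

class LambdaConversionSketch {
  // Before: a one-method anonymous class implementing Spark's Function.
  static JavaRDD<Vector> featuresOld(JavaRDD<LabeledPoint> examples) {
    return examples.map(new Function<LabeledPoint, Vector>() {
      @Override
      public Vector call(LabeledPoint p) throws Exception {
        return p.features();
      }
    });
  }

  // After: Function has a single abstract method, so a method reference
  // (or lambda) satisfies it with none of the ceremony.
  static JavaRDD<Vector> featuresNew(JavaRDD<LabeledPoint> examples) {
    return examples.map(LabeledPoint::features);
  }
}

Both versions compile to the same call; the method-reference form is what the JavaNaiveBayesSuite and JavaLinearRegressionSuite hunks below adopt.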
Diffstat (limited to 'mllib')
-rw-r--r-- mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java                   | 35
-rw-r--r-- mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java  | 10
-rw-r--r-- mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java |  4
-rw-r--r-- mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java             | 40
-rw-r--r-- mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java       |  6
-rw-r--r-- mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java | 11
-rw-r--r-- mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java          | 15
7 files changed, 42 insertions, 79 deletions
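Beyond the lambda conversions, several JavaLDASuite and JavaDecisionTreeSuite hunks below reorder assertion arguments: JUnit's Assert.assertEquals takes (expected, actual), so passing the computed value first makes a failure message label the two values backwards. A small sketch of the convention (test class and values are hypothetical):

import static org.junit.Assert.assertEquals;

import org.junit.Test;

public class AssertOrderSketch {
  @Test
  public void expectedComesFirst() {
    int k = 3;                  // the known, expected value
    int[] summary = {0, 1, 2};  // stand-in for a computed result
    // assertEquals(summary.length, k) would also pass here, but on a
    // failure JUnit would report the computed value as "expected".
    assertEquals(k, summary.length);
  }
}

The same hunks replace Assert.assertTrue(a == b) with Assert.assertEquals, which reports both values on failure instead of a bare AssertionError.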
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
index 8c0338e284..683ceffeae 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
@@ -21,16 +21,14 @@ import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
-import scala.Tuple2;
-
import org.junit.Assert;
import org.junit.Test;
import org.apache.spark.SharedSparkSession;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.mllib.linalg.DenseVector;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.distributed.RowMatrix;
import org.apache.spark.sql.Dataset;
@@ -69,35 +67,22 @@ public class JavaPCASuite extends SharedSparkSession {
JavaRDD<Vector> dataRDD = jsc.parallelize(points, 2);
RowMatrix mat = new RowMatrix(dataRDD.map(
- new Function<Vector, org.apache.spark.mllib.linalg.Vector>() {
- public org.apache.spark.mllib.linalg.Vector call(Vector vector) {
- return new org.apache.spark.mllib.linalg.DenseVector(vector.toArray());
- }
- }
+ (Vector vector) -> (org.apache.spark.mllib.linalg.Vector) new DenseVector(vector.toArray())
).rdd());
Matrix pc = mat.computePrincipalComponents(3);
mat.multiply(pc).rows().toJavaRDD();
- JavaRDD<Vector> expected = mat.multiply(pc).rows().toJavaRDD().map(
- new Function<org.apache.spark.mllib.linalg.Vector, Vector>() {
- public Vector call(org.apache.spark.mllib.linalg.Vector vector) {
- return vector.asML();
- }
- }
- );
+ JavaRDD<Vector> expected = mat.multiply(pc).rows().toJavaRDD()
+ .map(org.apache.spark.mllib.linalg.Vector::asML);
- JavaRDD<VectorPair> featuresExpected = dataRDD.zip(expected).map(
- new Function<Tuple2<Vector, Vector>, VectorPair>() {
- public VectorPair call(Tuple2<Vector, Vector> pair) {
- VectorPair featuresExpected = new VectorPair();
- featuresExpected.setFeatures(pair._1());
- featuresExpected.setExpected(pair._2());
- return featuresExpected;
- }
- }
- );
+ JavaRDD<VectorPair> featuresExpected = dataRDD.zip(expected).map(pair -> {
+ VectorPair featuresExpected1 = new VectorPair();
+ featuresExpected1.setFeatures(pair._1());
+ featuresExpected1.setExpected(pair._2());
+ return featuresExpected1;
+ });
Dataset<Row> df = spark.createDataFrame(featuresExpected, VectorPair.class);
PCAModel pca = new PCA()
diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
index 6ded42e928..65db3d014f 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
@@ -25,7 +25,6 @@ import org.junit.Test;
import org.apache.spark.SharedSparkSession;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
@@ -42,7 +41,7 @@ public class JavaNaiveBayesSuite extends SharedSparkSession {
new LabeledPoint(2, Vectors.dense(0.0, 0.0, 2.0))
);
- private int validatePrediction(List<LabeledPoint> points, NaiveBayesModel model) {
+ private static int validatePrediction(List<LabeledPoint> points, NaiveBayesModel model) {
int correct = 0;
for (LabeledPoint p : points) {
if (model.predict(p.features()) == p.label()) {
@@ -80,12 +79,7 @@ public class JavaNaiveBayesSuite extends SharedSparkSession {
public void testPredictJavaRDD() {
JavaRDD<LabeledPoint> examples = jsc.parallelize(POINTS, 2).cache();
NaiveBayesModel model = NaiveBayes.train(examples.rdd());
- JavaRDD<Vector> vectors = examples.map(new Function<LabeledPoint, Vector>() {
- @Override
- public Vector call(LabeledPoint v) throws Exception {
- return v.features();
- }
- });
+ JavaRDD<Vector> vectors = examples.map(LabeledPoint::features);
JavaRDD<Double> predictions = model.predict(vectors);
// Should be able to get the first prediction.
predictions.first();
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java
index 3d62b273d2..b4196c6ecd 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaBisectingKMeansSuite.java
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.clustering;
-import com.google.common.collect.Lists;
+import java.util.Arrays;
import org.junit.Assert;
import org.junit.Test;
@@ -31,7 +31,7 @@ public class JavaBisectingKMeansSuite extends SharedSparkSession {
@Test
public void twoDimensionalData() {
- JavaRDD<Vector> points = jsc.parallelize(Lists.newArrayList(
+ JavaRDD<Vector> points = jsc.parallelize(Arrays.asList(
Vectors.dense(4, -1),
Vectors.dense(4, 1),
Vectors.sparse(2, new int[]{0}, new double[]{1.0})
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index 08d6713ab2..38ee2507f2 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.List;
import scala.Tuple2;
import scala.Tuple3;
@@ -30,7 +31,6 @@ import static org.junit.Assert.*;
import org.apache.spark.SharedSparkSession;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
@@ -39,7 +39,7 @@ public class JavaLDASuite extends SharedSparkSession {
@Override
public void setUp() throws IOException {
super.setUp();
- ArrayList<Tuple2<Long, Vector>> tinyCorpus = new ArrayList<>();
+ List<Tuple2<Long, Vector>> tinyCorpus = new ArrayList<>();
for (int i = 0; i < LDASuite.tinyCorpus().length; i++) {
tinyCorpus.add(new Tuple2<>((Long) LDASuite.tinyCorpus()[i]._1(),
LDASuite.tinyCorpus()[i]._2()));
@@ -53,7 +53,7 @@ public class JavaLDASuite extends SharedSparkSession {
Matrix topics = LDASuite.tinyTopics();
double[] topicConcentration = new double[topics.numRows()];
Arrays.fill(topicConcentration, 1.0D / topics.numRows());
- LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1D, 100D);
+ LocalLDAModel model = new LocalLDAModel(topics, Vectors.dense(topicConcentration), 1.0, 100.0);
// Check: basic parameters
assertEquals(model.k(), tinyK);
@@ -87,17 +87,17 @@ public class JavaLDASuite extends SharedSparkSession {
// Check: basic parameters
LocalLDAModel localModel = model.toLocal();
- assertEquals(model.k(), k);
- assertEquals(localModel.k(), k);
- assertEquals(model.vocabSize(), tinyVocabSize);
- assertEquals(localModel.vocabSize(), tinyVocabSize);
- assertEquals(model.topicsMatrix(), localModel.topicsMatrix());
+ assertEquals(k, model.k());
+ assertEquals(k, localModel.k());
+ assertEquals(tinyVocabSize, model.vocabSize());
+ assertEquals(tinyVocabSize, localModel.vocabSize());
+ assertEquals(localModel.topicsMatrix(), model.topicsMatrix());
// Check: topic summaries
Tuple2<int[], double[]>[] roundedTopicSummary = model.describeTopics();
- assertEquals(roundedTopicSummary.length, k);
+ assertEquals(k, roundedTopicSummary.length);
Tuple2<int[], double[]>[] roundedLocalTopicSummary = localModel.describeTopics();
- assertEquals(roundedLocalTopicSummary.length, k);
+ assertEquals(k, roundedLocalTopicSummary.length);
// Check: log probabilities
assertTrue(model.logLikelihood() < 0.0);
@@ -107,12 +107,8 @@ public class JavaLDASuite extends SharedSparkSession {
JavaPairRDD<Long, Vector> topicDistributions = model.javaTopicDistributions();
// SPARK-5562. since the topicDistribution returns the distribution of the non empty docs
// over topics. Compare it against nonEmptyCorpus instead of corpus
- JavaPairRDD<Long, Vector> nonEmptyCorpus = corpus.filter(
- new Function<Tuple2<Long, Vector>, Boolean>() {
- public Boolean call(Tuple2<Long, Vector> tuple2) {
- return Vectors.norm(tuple2._2(), 1.0) != 0.0;
- }
- });
+ JavaPairRDD<Long, Vector> nonEmptyCorpus =
+ corpus.filter(tuple2 -> Vectors.norm(tuple2._2(), 1.0) != 0.0);
assertEquals(topicDistributions.count(), nonEmptyCorpus.count());
// Check: javaTopTopicsPerDocuments
@@ -155,14 +151,14 @@ public class JavaLDASuite extends SharedSparkSession {
LDAModel model = lda.run(corpus);
// Check: basic parameters
- assertEquals(model.k(), k);
- assertEquals(model.vocabSize(), tinyVocabSize);
+ assertEquals(k, model.k());
+ assertEquals(tinyVocabSize, model.vocabSize());
// Check: topic summaries
Tuple2<int[], double[]>[] roundedTopicSummary = model.describeTopics();
- assertEquals(roundedTopicSummary.length, k);
+ assertEquals(k, roundedTopicSummary.length);
Tuple2<int[], double[]>[] roundedLocalTopicSummary = model.describeTopics();
- assertEquals(roundedLocalTopicSummary.length, k);
+ assertEquals(k, roundedLocalTopicSummary.length);
}
@Test
@@ -177,7 +173,7 @@ public class JavaLDASuite extends SharedSparkSession {
double logPerplexity = toyModel.logPerplexity(pairedDocs);
// check: logLikelihood.
- ArrayList<Tuple2<Long, Vector>> docsSingleWord = new ArrayList<>();
+ List<Tuple2<Long, Vector>> docsSingleWord = new ArrayList<>();
docsSingleWord.add(new Tuple2<>(0L, Vectors.dense(1.0, 0.0, 0.0)));
JavaPairRDD<Long, Vector> single = JavaPairRDD.fromJavaRDD(jsc.parallelize(docsSingleWord));
double logLikelihood = toyModel.logLikelihood(single);
@@ -190,6 +186,6 @@ public class JavaLDASuite extends SharedSparkSession {
LDASuite.tinyTopicDescription();
private JavaPairRDD<Long, Vector> corpus;
private LocalLDAModel toyModel = LDASuite.toyModel();
- private ArrayList<Tuple2<Long, Vector>> toyData = LDASuite.javaToyData();
+ private List<Tuple2<Long, Vector>> toyData = LDASuite.javaToyData();
}
diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java
index 3451e07737..15de566c88 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java
@@ -31,9 +31,9 @@ public class JavaAssociationRulesSuite extends SharedSparkSession {
@SuppressWarnings("unchecked")
JavaRDD<FPGrowth.FreqItemset<String>> freqItemsets = jsc.parallelize(Arrays.asList(
- new FreqItemset<String>(new String[]{"a"}, 15L),
- new FreqItemset<String>(new String[]{"b"}, 35L),
- new FreqItemset<String>(new String[]{"a", "b"}, 12L)
+ new FreqItemset<>(new String[]{"a"}, 15L),
+ new FreqItemset<>(new String[]{"b"}, 35L),
+ new FreqItemset<>(new String[]{"a", "b"}, 12L)
));
JavaRDD<AssociationRules.Rule<String>> results = (new AssociationRules()).run(freqItemsets);
diff --git a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java
index a46b1321b3..86c723aa00 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/regression/JavaLinearRegressionSuite.java
@@ -24,13 +24,13 @@ import org.junit.Test;
import org.apache.spark.SharedSparkSession;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.util.LinearDataGenerator;
public class JavaLinearRegressionSuite extends SharedSparkSession {
- int validatePrediction(List<LabeledPoint> validationData, LinearRegressionModel model) {
+ private static int validatePrediction(
+ List<LabeledPoint> validationData, LinearRegressionModel model) {
int numAccurate = 0;
for (LabeledPoint point : validationData) {
Double prediction = model.predict(point.features());
@@ -87,12 +87,7 @@ public class JavaLinearRegressionSuite extends SharedSparkSession {
LinearDataGenerator.generateLinearInputAsList(A, weights, nPoints, 42, 0.1), 2).cache();
LinearRegressionWithSGD linSGDImpl = new LinearRegressionWithSGD();
LinearRegressionModel model = linSGDImpl.run(testRDD.rdd());
- JavaRDD<Vector> vectors = testRDD.map(new Function<LabeledPoint, Vector>() {
- @Override
- public Vector call(LabeledPoint v) throws Exception {
- return v.features();
- }
- });
+ JavaRDD<Vector> vectors = testRDD.map(LabeledPoint::features);
JavaRDD<Double> predictions = model.predict(vectors);
// Should be able to get the first prediction.
predictions.first();
diff --git a/mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java b/mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java
index 1dcbbcaa02..0f71deb9ea 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/tree/JavaDecisionTreeSuite.java
@@ -25,8 +25,6 @@ import org.junit.Test;
import org.apache.spark.SharedSparkSession;
import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.configuration.Algo;
import org.apache.spark.mllib.tree.configuration.Strategy;
@@ -35,7 +33,7 @@ import org.apache.spark.mllib.tree.model.DecisionTreeModel;
public class JavaDecisionTreeSuite extends SharedSparkSession {
- int validatePrediction(List<LabeledPoint> validationData, DecisionTreeModel model) {
+ private static int validatePrediction(List<LabeledPoint> validationData, DecisionTreeModel model) {
int numCorrect = 0;
for (LabeledPoint point : validationData) {
Double prediction = model.predict(point.features());
@@ -63,7 +61,7 @@ public class JavaDecisionTreeSuite extends SharedSparkSession {
DecisionTreeModel model = learner.run(rdd.rdd());
int numCorrect = validatePrediction(arr, model);
- Assert.assertTrue(numCorrect == rdd.count());
+ Assert.assertEquals(numCorrect, rdd.count());
}
@Test
@@ -82,15 +80,10 @@ public class JavaDecisionTreeSuite extends SharedSparkSession {
DecisionTreeModel model = DecisionTree$.MODULE$.train(rdd.rdd(), strategy);
// java compatibility test
- JavaRDD<Double> predictions = model.predict(rdd.map(new Function<LabeledPoint, Vector>() {
- @Override
- public Vector call(LabeledPoint v1) {
- return v1.features();
- }
- }));
+ JavaRDD<Double> predictions = model.predict(rdd.map(LabeledPoint::features));
int numCorrect = validatePrediction(arr, model);
- Assert.assertTrue(numCorrect == rdd.count());
+ Assert.assertEquals(numCorrect, rdd.count());
}
}
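The remaining cleanups are routine Java housekeeping of the same vintage: locals and fields are declared as java.util.List rather than ArrayList, explicit type arguments give way to the diamond operator, and Guava's Lists.newArrayList is dropped for the JDK's Arrays.asList. A sketch of the declaration style (class and member names are illustrative):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class CollectionStyleSketch {
  // Declare against the interface; the diamond operator infers the
  // element type from the left-hand side.
  private final List<String> corpus = new ArrayList<>();

  List<String> tokens() {
    // Arrays.asList builds a fixed-size list from the JDK alone, without
    // the Guava dependency that Lists.newArrayList carried.
    return Arrays.asList("a", "b", "c");
  }
}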