From 670891496a82538a5e2bf981a4044fb6f4cbb062 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Tue, 19 Jul 2016 12:31:04 +0100
Subject: [SPARK-16494][ML] Upgrade breeze version to 0.12

## What changes were proposed in this pull request?
breeze 0.12 was released more than half a year ago, and it brings many new features, performance improvements, and bug fixes. One of the biggest features is ```LBFGS-B```, an implementation of ```LBFGS``` with box constraints that is much faster for some special cases. We would like to implement the Huber loss function for ```LinearRegression``` ([SPARK-3181](https://issues.apache.org/jira/browse/SPARK-3181)), which requires ```LBFGS-B``` as the optimization solver. So we should bump the breeze dependency up to 0.12. For more of the features, improvements, and bug fixes in breeze 0.12, refer to the following link:
https://groups.google.com/forum/#!topic/scala-breeze/nEeRi_DcY5c

## How was this patch tested?
No new tests; this change should pass the existing ones.

Author: Yanbo Liang

Closes #14150 from yanboliang/spark-16494.
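As a quick illustration of the kind of solver this unlocks, here is a minimal sketch of driving breeze 0.12's box-constrained ```LBFGSB``` directly. This is not code from this patch; the constructor and its argument order are my reading of the breeze 0.12 API and worth double-checking:

```scala
import breeze.linalg.DenseVector
import breeze.optimize.{DiffFunction, LBFGSB}

object LBFGSBSketch {
  def main(args: Array[String]): Unit = {
    // f(x) = ||x - 3||^2 with gradient 2 * (x - 3); unconstrained optimum at x = 3.
    val f = new DiffFunction[DenseVector[Double]] {
      override def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = {
        val diff = x - 3.0
        (diff dot diff, diff * 2.0)
      }
    }
    // Box constraints 0 <= x <= 2 exclude the unconstrained optimum,
    // so the solver should end up on the boundary at x = 2.
    val solver = new LBFGSB(DenseVector.zeros[Double](2), DenseVector.fill(2)(2.0))
    println(solver.minimize(f, DenseVector.fill(2)(1.0))) // expect ~DenseVector(2.0, 2.0)
  }
}
```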
---
 dev/deps/spark-deps-hadoop-2.2                                    | 5 +++--
 dev/deps/spark-deps-hadoop-2.3                                    | 5 +++--
 dev/deps/spark-deps-hadoop-2.4                                    | 5 +++--
 dev/deps/spark-deps-hadoop-2.6                                    | 5 +++--
 dev/deps/spark-deps-hadoop-2.7                                    | 5 +++--
 .../org/apache/spark/ml/classification/LogisticRegression.scala  | 5 -----
 .../org/apache/spark/ml/regression/AFTSurvivalRegression.scala   | 6 ------
 .../scala/org/apache/spark/ml/regression/LinearRegression.scala  | 5 -----
 .../main/scala/org/apache/spark/mllib/clustering/LDAModel.scala  | 8 +++++++-
 .../scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala   | 5 +++--
 .../main/scala/org/apache/spark/mllib/optimization/LBFGS.scala   | 5 -----
 .../src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java  | 6 +++++-
 .../test/scala/org/apache/spark/mllib/clustering/LDASuite.scala  | 4 ++--
 .../src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala | 9 ++++++---
 pom.xml                                                           | 2 +-
 python/pyspark/ml/classification.py                               | 2 +-
 16 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index feb3474cf1..5d536b7c24 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -12,8 +12,8 @@ avro-1.7.7.jar
 avro-ipc-1.7.7.jar
 avro-mapred-1.7.7-hadoop2.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -147,6 +147,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index 3e960358f7..d16f42a97d 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -15,8 +15,8 @@ avro-mapred-1.7.7-hadoop2.jar
 base64-2.3.8.jar
 bcprov-jdk15on-1.51.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -154,6 +154,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index 3fc14a6fbf..2e261cb9a5 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -15,8 +15,8 @@ avro-mapred-1.7.7-hadoop2.jar
 base64-2.3.8.jar
 bcprov-jdk15on-1.51.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -154,6 +154,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 909fbb8852..67f38f4c22 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -19,8 +19,8 @@ avro-mapred-1.7.7-hadoop2.jar
 base64-2.3.8.jar
 bcprov-jdk15on-1.51.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -162,6 +162,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index b986a313a0..07583963d9 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -19,8 +19,8 @@ avro-mapred-1.7.7-hadoop2.jar
 base64-2.3.8.jar
 bcprov-jdk15on-1.51.jar
 bonecp-0.8.0.RELEASE.jar
-breeze-macros_2.11-0.11.2.jar
-breeze_2.11-0.11.2.jar
+breeze-macros_2.11-0.12.jar
+breeze_2.11-0.12.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
@@ -163,6 +163,7 @@ scala-parser-combinators_2.11-1.0.4.jar
 scala-reflect-2.11.8.jar
 scala-xml_2.11-1.0.2.jar
 scalap-2.11.8.jar
+shapeless_2.11-2.0.0.jar
 slf4j-api-1.7.16.jar
 slf4j-log4j12-1.7.16.jar
 snappy-0.2.jar
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 91eee0e69d..7694773c81 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -424,11 +424,6 @@ class LogisticRegression @Since("1.2.0") (
         throw new SparkException(msg)
       }
 
-      if (!state.actuallyConverged) {
-        logWarning("LogisticRegression training finished but the result " +
-          s"is not converged because: ${state.convergedReason.get.reason}")
-      }
-
       /*
          The coefficients are trained in the scaled space; we're converting them back to
          the original space.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index 700a92cc26..2b9912657f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -244,12 +244,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
         val msg = s"${optimizer.getClass.getName} failed."
        throw new SparkException(msg)
      }
-
-      if (!state.actuallyConverged) {
-        logWarning("AFTSurvivalRegression training finished but the result " +
-          s"is not converged because: ${state.convergedReason.get.reason}")
-      }
-
      state.x.toArray.clone()
    }
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 0a155e1844..a0ff7f07aa 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -325,11 +325,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
         throw new SparkException(msg)
       }
 
-      if (!state.actuallyConverged) {
-        logWarning("LinearRegression training finished but the result " +
-          s"is not converged because: ${state.convergedReason.get.reason}")
-      }
-
       /*
          The coefficients are trained in the scaled space; we're converting them back to
          the original space.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 9ebba1de0d..90d8a558f1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -784,7 +784,13 @@ class DistributedLDAModel private[clustering] (
   @Since("1.5.0")
   def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = {
     graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) =>
-      val topIndices = argtopk(topicCounts, k)
+      // TODO: Remove work-around for the breeze bug.
+      // https://github.com/scalanlp/breeze/issues/561
+      val topIndices = if (k == topicCounts.length) {
+        Seq.range(0, k)
+      } else {
+        argtopk(topicCounts, k)
+      }
       val sumCounts = sum(topicCounts)
       val weights = if (sumCounts != 0) {
         topicCounts(topIndices) / sumCounts
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 2436efba32..e2c6aca553 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -508,8 +508,9 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     val weight = rho()
     val N = gammat.rows.toDouble
     val alpha = this.alpha.asBreeze.toDenseVector
-    val logphat: BDM[Double] = sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)) / N
-    val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat.toDenseVector)
+    val logphat: BDV[Double] =
+      sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)).t / N
+    val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat)
 
     val c = N * trigamma(sum(alpha))
     val q = -N * trigamma(alpha)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
index fd09f35277..e49363c2c6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
@@ -213,11 +213,6 @@ object LBFGS extends Logging {
     }
     lossHistory += state.value
 
-    if (!state.actuallyConverged) {
-      logWarning("LBFGS training finished but the result " +
-        s"is not converged because: ${state.convergedReason.get.reason}")
-    }
-
     val weights = Vectors.fromBreeze(state.x)
 
     val lossHistoryArray = lossHistory.result()
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
index ac479c0841..8c0338e284 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaPCASuite.java
@@ -107,7 +107,11 @@ public class JavaPCASuite extends SharedSparkSession {
       .fit(df);
     List<Row> result = pca.transform(df).select("pca_features", "expected").toJavaRDD().collect();
     for (Row r : result) {
-      Assert.assertEquals(r.get(1), r.get(0));
+      Vector calculatedVector = (Vector) r.get(0);
+      Vector expectedVector = (Vector) r.get(1);
+      for (int i = 0; i < calculatedVector.size(); i++) {
+        Assert.assertEquals(calculatedVector.apply(i), expectedVector.apply(i), 1.0e-8);
+      }
     }
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index eb050158d4..211e2bc026 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -118,8 +118,8 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(weights.length == 2)
       val bdvTopicDist = topicDistribution.asBreeze
       val top2Indices = argtopk(bdvTopicDist, 2)
-      assert(top2Indices.toArray === indices)
-      assert(bdvTopicDist(top2Indices).toArray === weights)
+      assert(top2Indices.toSet === indices.toSet)
+      assert(bdvTopicDist(top2Indices).toArray.toSet === weights.toSet)
     }
 
     // Check: log probabilities
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
index a8d82932d3..2f90afdcee 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/PCASuite.scala
@@ -18,9 +18,10 @@
 package org.apache.spark.mllib.feature
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.linalg.distributed.RowMatrix
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
 
 class PCASuite extends SparkFunSuite with MLlibTestSparkContext {
 
@@ -42,7 +43,9 @@ class PCASuite extends SparkFunSuite with MLlibTestSparkContext {
     val pca_transform = pca.transform(dataRDD).collect()
     val mat_multiply = mat.multiply(pc).rows.collect()
 
-    assert(pca_transform.toSet === mat_multiply.toSet)
-    assert(pca.explainedVariance === explainedVariance)
+    pca_transform.zip(mat_multiply).foreach { case (calculated, expected) =>
+      assert(calculated ~== expected relTol 1e-8)
+    }
+    assert(pca.explainedVariance ~== explainedVariance relTol 1e-8)
   }
 }
diff --git a/pom.xml b/pom.xml
index 4c8671a570..d064cb57dd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -657,7 +657,7 @@
       <dependency>
         <groupId>org.scalanlp</groupId>
         <artifactId>breeze_${scala.binary.version}</artifactId>
-        <version>0.11.2</version>
+        <version>0.12</version>
         <exclusions>
           <!-- This is included as a compile-scoped dependency by jtransforms, which is
                a dependency of breeze. -->
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 3c4af90aca..613bc8cb3e 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1299,7 +1299,7 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
     >>> [x.coefficients for x in model.models]
     [DenseVector([3.3925, 1.8785]), DenseVector([-4.3016, -6.3163]), DenseVector([-4.5855, 6.1785])]
     >>> [x.intercept for x in model.models]
-    [-3.6474708290602034, 2.5507881951814495, -1.1016513228162115]
+    [-3.64747..., 2.55078..., -1.10165...]
     >>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0, 0.0))]).toDF()
     >>> model.transform(test0).head().prediction
     1.0
-- 
cgit v1.2.3
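One more note on the ```OnlineLDAOptimizer``` hunk above: under breeze 0.12, summing a matrix across broadcasted columns yields a transposed row vector instead of a one-row matrix, which is why ```logphat``` is now typed ```BDV[Double]``` and built with ```.t```. A minimal sketch of that behavior, assuming breeze 0.12 semantics (not code from this patch):

```scala
import breeze.linalg.{sum, DenseMatrix => BDM, DenseVector => BDV}

object BroadcastSumSketch {
  def main(args: Array[String]): Unit = {
    val m = BDM((1.0, 2.0), (3.0, 4.0))
    // Sum each column via broadcasting. In breeze 0.12 this returns a
    // Transpose[DenseVector] (a row), so `.t` recovers a column vector.
    val colSums: BDV[Double] = sum(m(::, breeze.linalg.*)).t
    println(colSums) // DenseVector(4.0, 6.0)
  }
}
```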