From 670891496a82538a5e2bf981a4044fb6f4cbb062 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 19 Jul 2016 12:31:04 +0100
Subject: [SPARK-16494][ML] Upgrade breeze version to 0.12

## What changes were proposed in this pull request?
Breeze 0.12 was released more than half a year ago, and it brings lots of new features, performance improvements and bug fixes.
One of the biggest features is ```LBFGS-B```, which is an implementation of ```LBFGS``` with box constraints and is much faster in some special cases.
We would like to implement the Huber loss function for ```LinearRegression``` ([SPARK-3181](https://issues.apache.org/jira/browse/SPARK-3181)), and it requires ```LBFGS-B``` as the optimization solver. So we should bump the breeze dependency version up to 0.12.
For more details on the features, improvements and bug fixes in breeze 0.12, refer to the following link:
https://groups.google.com/forum/#!topic/scala-breeze/nEeRi_DcY5c

## How was this patch tested?
No new tests were added; the change should pass the existing ones.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #14150 from yanboliang/spark-16494.
---
 .../org/apache/spark/ml/classification/LogisticRegression.scala   | 5 -----
 .../org/apache/spark/ml/regression/AFTSurvivalRegression.scala    | 6 ------
 .../scala/org/apache/spark/ml/regression/LinearRegression.scala   | 5 -----
 .../main/scala/org/apache/spark/mllib/clustering/LDAModel.scala   | 8 +++++++-
 .../scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala    | 5 +++--
 .../main/scala/org/apache/spark/mllib/optimization/LBFGS.scala    | 5 -----
 6 files changed, 10 insertions(+), 24 deletions(-)

(limited to 'mllib/src/main')

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 91eee0e69d..7694773c81 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -424,11 +424,6 @@ class LogisticRegression @Since("1.2.0") (
           throw new SparkException(msg)
         }
 
-        if (!state.actuallyConverged) {
-          logWarning("LogisticRegression training finished but the result " +
-            s"is not converged because: ${state.convergedReason.get.reason}")
-        }
-
         /*
            The coefficients are trained in the scaled space; we're converting them back to
            the original space.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index 700a92cc26..2b9912657f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -244,12 +244,6 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
         val msg = s"${optimizer.getClass.getName} failed."
         throw new SparkException(msg)
       }
-
-      if (!state.actuallyConverged) {
-        logWarning("AFTSurvivalRegression training finished but the result " +
-          s"is not converged because: ${state.convergedReason.get.reason}")
-      }
-
       state.x.toArray.clone()
     }
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 0a155e1844..a0ff7f07aa 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -325,11 +325,6 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
         throw new SparkException(msg)
       }
 
-      if (!state.actuallyConverged) {
-        logWarning("LinearRegression training finished but the result " +
-          s"is not converged because: ${state.convergedReason.get.reason}")
-      }
-
       /*
          The coefficients are trained in the scaled space; we're converting them back to
          the original space.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 9ebba1de0d..90d8a558f1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -784,7 +784,13 @@ class DistributedLDAModel private[clustering] (
   @Since("1.5.0")
   def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = {
     graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) =>
-      val topIndices = argtopk(topicCounts, k)
+      // TODO: Remove work-around for the breeze bug.
+      // https://github.com/scalanlp/breeze/issues/561
+      val topIndices = if (k == topicCounts.length) {
+        Seq.range(0, k)
+      } else {
+        argtopk(topicCounts, k)
+      }
       val sumCounts = sum(topicCounts)
       val weights = if (sumCounts != 0) {
         topicCounts(topIndices) / sumCounts
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 2436efba32..e2c6aca553 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -508,8 +508,9 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
     val weight = rho()
     val N = gammat.rows.toDouble
     val alpha = this.alpha.asBreeze.toDenseVector
-    val logphat: BDM[Double] = sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)) / N
-    val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat.toDenseVector)
+    val logphat: BDV[Double] =
+      sum(LDAUtils.dirichletExpectation(gammat)(::, breeze.linalg.*)).t / N
+    val gradf = N * (-LDAUtils.dirichletExpectation(alpha) + logphat)
 
     val c = N * trigamma(sum(alpha))
     val q = -N * trigamma(alpha)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
index fd09f35277..e49363c2c6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
@@ -213,11 +213,6 @@ object LBFGS extends Logging {
     }
     lossHistory += state.value
 
-    if (!state.actuallyConverged) {
-      logWarning("LBFGS training finished but the result " +
-        s"is not converged because: ${state.convergedReason.get.reason}")
-    }
-
     val weights = Vectors.fromBreeze(state.x)
 
     val lossHistoryArray = lossHistory.result()
-- 
cgit v1.2.3