aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2015-10-07 15:50:45 -0700
committerXiangrui Meng <meng@databricks.com>2015-10-07 15:50:45 -0700
commit7bf07faa716bd6a01252c5e888d0956096bde026 (patch)
tree9acef1a26d9f429210e499a7f63d28207c025daf /mllib
parent6dbfd7ecf41297213f4ce8024d00c40808c5ac8f (diff)
downloadspark-7bf07faa716bd6a01252c5e888d0956096bde026.tar.gz
spark-7bf07faa716bd6a01252c5e888d0956096bde026.tar.bz2
spark-7bf07faa716bd6a01252c5e888d0956096bde026.zip
[SPARK-10490] [ML] Consolidate the Cholesky solvers in WeightedLeastSquares and ALS
Consolidate the Cholesky solvers in WeightedLeastSquares and ALS. Author: Yanbo Liang <ybliang8@gmail.com> Closes #8936 from yanboliang/spark-10490.
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala23
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala10
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/linalg/CholeskyDecomposition.scala43
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala6
4 files changed, 47 insertions, 35 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
index 4374e99631..d7eaa5a926 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
@@ -17,12 +17,8 @@
package org.apache.spark.ml.optim
-import com.github.fommil.netlib.LAPACK.{getInstance => lapack}
-import org.netlib.util.intW
-
import org.apache.spark.Logging
import org.apache.spark.mllib.linalg._
-import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
/**
@@ -110,7 +106,7 @@ private[ml] class WeightedLeastSquares(
j += 1
}
- val x = choleskySolve(aaBar.values, abBar)
+ val x = new DenseVector(CholeskyDecomposition.solve(aaBar.values, abBar.values))
// compute intercept
val intercept = if (fitIntercept) {
@@ -121,23 +117,6 @@ private[ml] class WeightedLeastSquares(
new WeightedLeastSquaresModel(x, intercept)
}
-
- /**
- * Solves a symmetric positive definite linear system via Cholesky factorization.
- * The input arguments are modified in-place to store the factorization and the solution.
- * @param A the upper triangular part of A
- * @param bx right-hand side
- * @return the solution vector
- */
- // TODO: SPARK-10490 - consolidate this and the Cholesky solver in ALS
- private def choleskySolve(A: Array[Double], bx: DenseVector): DenseVector = {
- val k = bx.size
- val info = new intW(0)
- lapack.dppsv("U", k, 1, A, bx.values, k, info)
- val code = info.`val`
- assert(code == 0, s"lapack.dpotrs returned $code.")
- bx
- }
}
private[ml] object WeightedLeastSquares {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index f6f5281f71..535f266b9a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -26,9 +26,7 @@ import scala.util.Sorting
import scala.util.hashing.byteswap64
import com.github.fommil.netlib.BLAS.{getInstance => blas}
-import com.github.fommil.netlib.LAPACK.{getInstance => lapack}
import org.apache.hadoop.fs.{FileSystem, Path}
-import org.netlib.util.intW
import org.apache.spark.{Logging, Partitioner}
import org.apache.spark.annotation.{DeveloperApi, Experimental}
@@ -36,6 +34,7 @@ import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.mllib.linalg.CholeskyDecomposition
import org.apache.spark.mllib.optimization.NNLS
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
@@ -366,8 +365,6 @@ object ALS extends Logging {
/** Cholesky solver for least square problems. */
private[recommendation] class CholeskySolver extends LeastSquaresNESolver {
- private val upper = "U"
-
/**
* Solves a least squares problem with L2 regularization:
*
@@ -387,10 +384,7 @@ object ALS extends Logging {
i += j
j += 1
}
- val info = new intW(0)
- lapack.dppsv(upper, k, 1, ne.ata, ne.atb, k, info)
- val code = info.`val`
- assert(code == 0, s"lapack.dppsv returned $code.")
+ CholeskyDecomposition.solve(ne.ata, ne.atb)
val x = new Array[Float](k)
i = 0
while (i < k) {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/CholeskyDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/CholeskyDecomposition.scala
new file mode 100644
index 0000000000..66eb40b6f4
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/CholeskyDecomposition.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.linalg
+
+import com.github.fommil.netlib.LAPACK.{getInstance => lapack}
+import org.netlib.util.intW
+
+/**
+ * Compute Cholesky decomposition.
+ */
+private[spark] object CholeskyDecomposition {
+
+ /**
+ * Solves a symmetric positive definite linear system via Cholesky factorization.
+ * The input arguments are modified in-place to store the factorization and the solution.
+   * @param A the upper triangular part of the matrix A, in packed column-major storage
+   * @param bx right-hand side vector; overwritten with the solution on return
+   * @return the solution array
+ */
+ def solve(A: Array[Double], bx: Array[Double]): Array[Double] = {
+ val k = bx.size
+ val info = new intW(0)
+ lapack.dppsv("U", k, 1, A, bx, k, info)
+ val code = info.`val`
+    assert(code == 0, s"lapack.dppsv returned $code.")
+ bx
+ }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
index ae3ba3099c..863abe86d3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/EigenValueDecomposition.scala
@@ -21,13 +21,9 @@ import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import com.github.fommil.netlib.ARPACK
import org.netlib.util.{intW, doubleW}
-import org.apache.spark.annotation.Experimental
-
/**
- * :: Experimental ::
* Compute eigen-decomposition.
*/
-@Experimental
private[mllib] object EigenValueDecomposition {
/**
* Compute the leading k eigenvalues and eigenvectors on a symmetric square matrix using ARPACK.
@@ -46,7 +42,7 @@ private[mllib] object EigenValueDecomposition {
* for more details). The maximum number of Arnoldi update iterations is set to 300 in this
* function.
*/
- private[mllib] def symmetricEigs(
+ def symmetricEigs(
mul: BDV[Double] => BDV[Double],
n: Int,
k: Int,