[SPARK-8314][MLlib] improvement in performance of MLUtils.appendBias

The MLUtils.appendBias method is heavily used in creating intercepts for linear models. It uses Breeze's vector concatenation, which is very slow compared to a plain System.arraycopy. This improvement changes the implementation to use System.arraycopy. I saw the following performance improvements after the change. Benchmark with the MNIST dataset, run 50 times: MLUtils.appendBias (SparseVector Before): 47320 ms; MLUtils.appendBias (SparseVector After): 1935 ms; MLUtils.appendBias (DenseVector Before): 5340 ms; MLUtils.appendBias (DenseVector After): 4080 ms. This is roughly a 24x performance boost for SparseVectors. Author: Roger Menezes <rmenezes@netflix.com> Closes #6768 from rogermenezes/improve-append-bias and squashes the following commits: 4e42f75 [Roger Menezes] address feedback e999d79 [Roger Menezes] first commit
author: Roger Menezes <rmenezes@netflix.com> 2015-06-12 18:29:58 -0700
committer: DB Tsai <dbt@netflix.com> 2015-06-12 18:29:58 -0700
commit: 6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34 (patch)
tree: b2119951c52ce72f894c859c5a64918b9d3c3971
parent: e9471d3414d327c7d0853e18f1844ab1bd09c8ed (diff)
download: spark-6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34.tar.gz
spark-6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34.tar.bz2
spark-6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34.zip
3 files changed, 25 insertions, 9 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
index b12f833ce9..3cf193f353 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -145,9 +145,9 @@ object LogisticRegressionExample {
     val elapsedTime = (System.nanoTime() - startTime) / 1e9
     println(s"Training time: $elapsedTime seconds")
 
-    val lirModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
+    val lorModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
     // Print the weights and intercept for logistic regression.
-    println(s"Weights: ${lirModel.weights} Intercept: ${lirModel.intercept}")
+    println(s"Weights: ${lorModel.weights} Intercept: ${lorModel.intercept}")
 
     println("Training data results:")
     DecisionTreeExample.evaluateClassificationModel(pipelineModel, training, "indexedLabel")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index 557119f7b1..3523f18043 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -213,9 +213,9 @@ private[spark] object BLAS extends Serializable with Logging {
   def scal(a: Double, x: Vector): Unit = {
     x match {
       case sx: SparseVector =>
-        f2jBLAS.dscal(sx.values.size, a, sx.values, 1)
+        f2jBLAS.dscal(sx.values.length, a, sx.values, 1)
       case dx: DenseVector =>
-        f2jBLAS.dscal(dx.values.size, a, dx.values, 1)
+        f2jBLAS.dscal(dx.values.length, a, dx.values, 1)
       case _ =>
         throw new IllegalArgumentException(s"scal doesn't support vector type ${x.getClass}.")
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 52d6468a72..7c5cfa7bd8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -270,12 +270,28 @@ object MLUtils {
    * Returns a new vector with `1.0` (bias) appended to the input vector.
    */
   def appendBias(vector: Vector): Vector = {
-    val vector1 = vector.toBreeze match {
-      case dv: BDV[Double] => BDV.vertcat(dv, new BDV[Double](Array(1.0)))
-      case sv: BSV[Double] => BSV.vertcat(sv, new BSV[Double](Array(0), Array(1.0), 1))
-      case v: Any => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
+    vector match {
+      case dv: DenseVector =>
+        val inputValues = dv.values
+        val inputLength = inputValues.length
+        val outputValues = Array.ofDim[Double](inputLength + 1)
+        System.arraycopy(inputValues, 0, outputValues, 0, inputLength)
+        outputValues(inputLength) = 1.0
+        Vectors.dense(outputValues)
+      case sv: SparseVector =>
+        val inputValues = sv.values
+        val inputIndices = sv.indices
+        val inputValuesLength = inputValues.length
+        val dim = sv.size
+        val outputValues = Array.ofDim[Double](inputValuesLength + 1)
+        val outputIndices = Array.ofDim[Int](inputValuesLength + 1)
+        System.arraycopy(inputValues, 0, outputValues, 0, inputValuesLength)
+        System.arraycopy(inputIndices, 0, outputIndices, 0, inputValuesLength)
+        outputValues(inputValuesLength) = 1.0
+        outputIndices(inputValuesLength) = dim
+        Vectors.sparse(dim + 1, outputIndices, outputValues)
+      case _ => throw new IllegalArgumentException(s"Do not support vector type ${vector.getClass}")
     }
-    Vectors.fromBreeze(vector1)
   }
 
   /**
author	Roger Menezes <rmenezes@netflix.com>	2015-06-12 18:29:58 -0700
committer	DB Tsai <dbt@netflix.com>	2015-06-12 18:29:58 -0700
commit	6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34 (patch)
tree	b2119951c52ce72f894c859c5a64918b9d3c3971
parent	e9471d3414d327c7d0853e18f1844ab1bd09c8ed (diff)
download	spark-6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34.tar.gz spark-6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34.tar.bz2 spark-6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34.zip