about · summary · refs · log · tree · commit · diff
path: root/mllib
diff options
context:
space:
mode:
authorDB Tsai <dbtsai@alpinenow.com>2014-11-25 01:57:34 -0800
committerXiangrui Meng <meng@databricks.com>2014-11-25 01:57:44 -0800
commit74571991b894a3b1ec47644d850a64276252b3fb (patch)
tree941499605ac91f17415fd5a03a7050dec5bf11a8 /mllib
parent1f4d1ac4bc782f888757073ee2becf59a5251774 (diff)
downloadspark-74571991b894a3b1ec47644d850a64276252b3fb.tar.gz
spark-74571991b894a3b1ec47644d850a64276252b3fb.tar.bz2
spark-74571991b894a3b1ec47644d850a64276252b3fb.zip
[SPARK-4596][MLLib] Refactorize Normalizer to make code cleaner
In this refactoring, the performance will be slightly increased due to removing the overhead from the breeze vector. The bottleneck is still in the breeze norm, which is implemented by activeIterator. This inefficiency of the breeze norm will be addressed in the next PR. At least, this PR makes the code more consistent in the codebase. Author: DB Tsai <dbtsai@alpinenow.com> Closes #3446 from dbtsai/normalizer and squashes the following commits: e20a2b9 [DB Tsai] first commit (cherry picked from commit 89f912264603741c7d980135c26102d63e11791f) Signed-off-by: Xiangrui Meng <meng@databricks.com>
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala29
1 file changed, 19 insertions, 10 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala
index dfad25d57c..a9c2e23717 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala
@@ -17,10 +17,10 @@
package org.apache.spark.mllib.feature
-import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, norm => brzNorm}
+import breeze.linalg.{norm => brzNorm}
import org.apache.spark.annotation.Experimental
-import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
/**
* :: Experimental ::
@@ -47,22 +47,31 @@ class Normalizer(p: Double) extends VectorTransformer {
* @return normalized vector. If the norm of the input is zero, it will return the input vector.
*/
override def transform(vector: Vector): Vector = {
- var norm = brzNorm(vector.toBreeze, p)
+ val norm = brzNorm(vector.toBreeze, p)
if (norm != 0.0) {
// For dense vector, we've to allocate new memory for new output vector.
// However, for sparse vector, the `index` array will not be changed,
// so we can re-use it to save memory.
- vector.toBreeze match {
- case dv: BDV[Double] => Vectors.fromBreeze(dv :/ norm)
- case sv: BSV[Double] =>
- val output = new BSV[Double](sv.index, sv.data.clone(), sv.length)
+ vector match {
+ case dv: DenseVector =>
+ val values = dv.values.clone()
+ val size = values.size
var i = 0
- while (i < output.data.length) {
- output.data(i) /= norm
+ while (i < size) {
+ values(i) /= norm
i += 1
}
- Vectors.fromBreeze(output)
+ Vectors.dense(values)
+ case sv: SparseVector =>
+ val values = sv.values.clone()
+ val nnz = values.size
+ var i = 0
+ while (i < nnz) {
+ values(i) /= norm
+ i += 1
+ }
+ Vectors.sparse(sv.size, sv.indices, values)
case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
}
} else {