[SPARK-11439][ML] Optimization of creating sparse feature without dense one

Sparse feature generated in LinearDataGenerator does not create dense vectors as an intermediate any more. Author: Nakul Jindal <njindal@us.ibm.com> Closes #9756 from nakul02/SPARK-11439_sparse_without_creating_dense_feature.
author: Nakul Jindal <njindal@us.ibm.com> 2015-12-08 11:08:27 +0000
committer: Sean Owen <sowen@cloudera.com> 2015-12-08 11:08:27 +0000
commit: 037b7e76a7f8b59e031873a768d81417dd180472 (patch)
tree: 2b782043ec03d134efdce90105482a934230b9e9 /mllib/src
parent: 708129187a460aca30790281e9221c0cd5e271df (diff)
download: spark-037b7e76a7f8b59e031873a768d81417dd180472.tar.gz
spark-037b7e76a7f8b59e031873a768d81417dd180472.tar.bz2
spark-037b7e76a7f8b59e031873a768d81417dd180472.zip
3 files changed, 142 insertions, 122 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
index 6ff07eed6c..094528e2ec 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -24,7 +24,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}
 
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Since}
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{BLAS, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
 
@@ -131,35 +131,27 @@ object LinearDataGenerator {
       eps: Double,
       sparsity: Double): Seq[LabeledPoint] = {
     require(0.0 <= sparsity && sparsity <= 1.0)
-    val rnd = new Random(seed)
-    val x = Array.fill[Array[Double]](nPoints)(
-      Array.fill[Double](weights.length)(rnd.nextDouble()))
-
-    val sparseRnd = new Random(seed)
-    x.foreach { v =>
-      var i = 0
-      val len = v.length
-      while (i < len) {
-        if (sparseRnd.nextDouble() < sparsity) {
-          v(i) = 0.0
-        } else {
-          v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
-        }
-        i += 1
-      }
-    }
 
-    val y = x.map { xi =>
-      blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
-    }
+    val rnd = new Random(seed)
+    def rndElement(i: Int) = {(rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)}
 
-    y.zip(x).map { p =>
-      if (sparsity == 0.0) {
+    if (sparsity == 0.0) {
+      (0 until nPoints).map { _ =>
+        val features = Vectors.dense(weights.indices.map { rndElement(_) }.toArray)
+        val label = BLAS.dot(Vectors.dense(weights), features) +
+          intercept + eps * rnd.nextGaussian()
         // Return LabeledPoints with DenseVector
-        LabeledPoint(p._1, Vectors.dense(p._2))
-      } else {
+        LabeledPoint(label, features)
+      }
+    } else {
+      (0 until nPoints).map { _ =>
+        val indices = weights.indices.filter { _ => rnd.nextDouble() <= sparsity}
+        val values = indices.map { rndElement(_) }
+        val features = Vectors.sparse(weights.length, indices.toArray, values.toArray)
+        val label = BLAS.dot(Vectors.dense(weights), features) +
+          intercept + eps * rnd.nextGaussian()
         // Return LabeledPoints with SparseVector
-        LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
+        LabeledPoint(label, features)
       }
     }
   }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index 60886bf77d..954d3bedc1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -65,15 +65,15 @@ class RegressionEvaluatorSuite
 
     // default = rmse
     val evaluator = new RegressionEvaluator()
-    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)
 
     // r2 score
     evaluator.setMetricName("r2")
-    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)
 
     // mae
     evaluator.setMetricName("mae")
-    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
+    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
   }
 
   test("read/write") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 2bdc0e184d..2f3e703f4c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -42,6 +42,7 @@ class LinearRegressionSuite
      In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
      is the same as the one trained by R's glmnet package. The following instruction
      describes how to reproduce the data in R.
+     In a spark-shell, use the following code:
 
      import org.apache.spark.mllib.util.LinearDataGenerator
      val data =
@@ -184,15 +185,15 @@ class LinearRegressionSuite
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
          (Intercept)         .
-         as.numeric.data.V2. 6.995908
-         as.numeric.data.V3. 5.275131
+         as.numeric.data.V2. 6.973403
+         as.numeric.data.V3. 5.284370
        */
-      val coefficientsR = Vectors.dense(6.995908, 5.275131)
+      val coefficientsR = Vectors.dense(6.973403, 5.284370)
 
-      assert(model1.intercept ~== 0 absTol 1E-3)
-      assert(model1.coefficients ~= coefficientsR relTol 1E-3)
-      assert(model2.intercept ~== 0 absTol 1E-3)
-      assert(model2.coefficients ~= coefficientsR relTol 1E-3)
+      assert(model1.intercept ~== 0 absTol 1E-2)
+      assert(model1.coefficients ~= coefficientsR relTol 1E-2)
+      assert(model2.intercept ~== 0 absTol 1E-2)
+      assert(model2.coefficients ~= coefficientsR relTol 1E-2)
 
       /*
          Then again with the data with no intercept:
@@ -235,14 +236,14 @@ class LinearRegressionSuite
            > coefficients
             3 x 1 sparse Matrix of class "dgCMatrix"
                                     s0
-           (Intercept)         6.24300
-           as.numeric.data.V2. 4.024821
-           as.numeric.data.V3. 6.679841
+           (Intercept)       6.242284
+           as.numeric.d1.V2. 4.019605
+           as.numeric.d1.V3. 6.679538
          */
-        val interceptR1 = 6.24300
-        val coefficientsR1 = Vectors.dense(4.024821, 6.679841)
-        assert(model1.intercept ~== interceptR1 relTol 1E-3)
-        assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+        val interceptR1 = 6.242284
+        val coefficientsR1 = Vectors.dense(4.019605, 6.679538)
+        assert(model1.intercept ~== interceptR1 relTol 1E-2)
+        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
         /*
            coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0,
@@ -296,14 +297,14 @@ class LinearRegressionSuite
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
            (Intercept)          .
-           as.numeric.data.V2. 6.299752
-           as.numeric.data.V3. 4.772913
+           as.numeric.data.V2. 6.272927
+           as.numeric.data.V3. 4.782604
          */
         val interceptR1 = 0.0
-        val coefficientsR1 = Vectors.dense(6.299752, 4.772913)
+        val coefficientsR1 = Vectors.dense(6.272927, 4.782604)
 
-        assert(model1.intercept ~== interceptR1 absTol 1E-3)
-        assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+        assert(model1.intercept ~== interceptR1 absTol 1E-2)
+        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
         /*
            coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0,
@@ -312,14 +313,14 @@ class LinearRegressionSuite
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
            (Intercept)         .
-           as.numeric.data.V2. 6.232193
-           as.numeric.data.V3. 4.764229
+           as.numeric.data.V2. 6.207817
+           as.numeric.data.V3. 4.775780
          */
         val interceptR2 = 0.0
-        val coefficientsR2 = Vectors.dense(6.232193, 4.764229)
+        val coefficientsR2 = Vectors.dense(6.207817, 4.775780)
 
-        assert(model2.intercept ~== interceptR2 absTol 1E-3)
-        assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+        assert(model2.intercept ~== interceptR2 absTol 1E-2)
+        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
         model1.transform(datasetWithDenseFeature).select("features", "prediction")
           .collect().foreach {
@@ -347,15 +348,15 @@ class LinearRegressionSuite
          > coefficients
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
-         (Intercept)         5.269376
-         as.numeric.data.V2. 3.736216
-         as.numeric.data.V3. 5.712356)
+         (Intercept)       5.260103
+         as.numeric.d1.V2. 3.725522
+         as.numeric.d1.V3. 5.711203
        */
-      val interceptR1 = 5.269376
-      val coefficientsR1 = Vectors.dense(3.736216, 5.712356)
+      val interceptR1 = 5.260103
+      val coefficientsR1 = Vectors.dense(3.725522, 5.711203)
 
-      assert(model1.intercept ~== interceptR1 relTol 1E-3)
-      assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+      assert(model1.intercept ~== interceptR1 relTol 1E-2)
+      assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
       /*
          coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
@@ -363,15 +364,15 @@ class LinearRegressionSuite
          > coefficients
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
-         (Intercept)         5.791109
-         as.numeric.data.V2. 3.435466
-         as.numeric.data.V3. 5.910406
+         (Intercept)       5.790885
+         as.numeric.d1.V2. 3.432373
+         as.numeric.d1.V3. 5.919196
        */
-      val interceptR2 = 5.791109
-      val coefficientsR2 = Vectors.dense(3.435466, 5.910406)
+      val interceptR2 = 5.790885
+      val coefficientsR2 = Vectors.dense(3.432373, 5.919196)
 
-      assert(model2.intercept ~== interceptR2 relTol 1E-3)
-      assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+      assert(model2.intercept ~== interceptR2 relTol 1E-2)
+      assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
       model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
         case Row(features: DenseVector, prediction1: Double) =>
@@ -398,15 +399,15 @@ class LinearRegressionSuite
          > coefficients
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
-         (Intercept)         .
-         as.numeric.data.V2. 5.522875
-         as.numeric.data.V3. 4.214502
+         (Intercept)       .
+         as.numeric.d1.V2. 5.493430
+         as.numeric.d1.V3. 4.223082
        */
       val interceptR1 = 0.0
-      val coefficientsR1 = Vectors.dense(5.522875, 4.214502)
+      val coefficientsR1 = Vectors.dense(5.493430, 4.223082)
 
-      assert(model1.intercept ~== interceptR1 absTol 1E-3)
-      assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+      assert(model1.intercept ~== interceptR1 absTol 1E-2)
+      assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
       /*
          coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
@@ -415,14 +416,14 @@ class LinearRegressionSuite
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
          (Intercept)         .
-         as.numeric.data.V2. 5.263704
-         as.numeric.data.V3. 4.187419
+         as.numeric.d1.V2. 5.244324
+         as.numeric.d1.V3. 4.203106
        */
       val interceptR2 = 0.0
-      val coefficientsR2 = Vectors.dense(5.263704, 4.187419)
+      val coefficientsR2 = Vectors.dense(5.244324, 4.203106)
 
-      assert(model2.intercept ~== interceptR2 absTol 1E-3)
-      assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+      assert(model2.intercept ~== interceptR2 absTol 1E-2)
+      assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
       model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
         case Row(features: DenseVector, prediction1: Double) =>
@@ -457,15 +458,15 @@ class LinearRegressionSuite
            > coefficients
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
-           (Intercept)         6.324108
-           as.numeric.data.V2. 3.168435
-           as.numeric.data.V3. 5.200403
+           (Intercept)       5.689855
+           as.numeric.d1.V2. 3.661181
+           as.numeric.d1.V3. 6.000274
          */
-        val interceptR1 = 5.696056
-        val coefficientsR1 = Vectors.dense(3.670489, 6.001122)
+        val interceptR1 = 5.689855
+        val coefficientsR1 = Vectors.dense(3.661181, 6.000274)
 
-        assert(model1.intercept ~== interceptR1 relTol 1E-3)
-        assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+        assert(model1.intercept ~== interceptR1 relTol 1E-2)
+        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
         /*
            coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6
@@ -473,15 +474,15 @@ class LinearRegressionSuite
            > coefficients
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
-           (Intercept)         6.114723
-           as.numeric.data.V2. 3.409937
-           as.numeric.data.V3. 6.146531
+           (Intercept)       6.113890
+           as.numeric.d1.V2. 3.407021
+           as.numeric.d1.V3. 6.152512
          */
-        val interceptR2 = 6.114723
-        val coefficientsR2 = Vectors.dense(3.409937, 6.146531)
+        val interceptR2 = 6.113890
+        val coefficientsR2 = Vectors.dense(3.407021, 6.152512)
 
-        assert(model2.intercept ~== interceptR2 relTol 1E-3)
-        assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+        assert(model2.intercept ~== interceptR2 relTol 1E-2)
+        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
         model1.transform(datasetWithDenseFeature).select("features", "prediction")
           .collect().foreach {
@@ -518,15 +519,15 @@ class LinearRegressionSuite
            > coefficients
             3 x 1 sparse Matrix of class "dgCMatrix"
                                       s0
-           (Intercept)         .
-           as.numeric.dataM.V2. 5.673348
-           as.numeric.dataM.V3. 4.322251
+           (Intercept)       .
+           as.numeric.d1.V2. 5.643748
+           as.numeric.d1.V3. 4.331519
          */
         val interceptR1 = 0.0
-        val coefficientsR1 = Vectors.dense(5.673348, 4.322251)
+        val coefficientsR1 = Vectors.dense(5.643748, 4.331519)
 
-        assert(model1.intercept ~== interceptR1 absTol 1E-3)
-        assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+        assert(model1.intercept ~== interceptR1 absTol 1E-2)
+        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
 
         /*
            coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3,
@@ -535,14 +536,15 @@ class LinearRegressionSuite
             3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
            (Intercept)         .
-           as.numeric.data.V2. 5.477988
-           as.numeric.data.V3. 4.297622
+           as.numeric.d1.V2. 5.455902
+           as.numeric.d1.V3. 4.312266
+
          */
         val interceptR2 = 0.0
-        val coefficientsR2 = Vectors.dense(5.477988, 4.297622)
+        val coefficientsR2 = Vectors.dense(5.455902, 4.312266)
 
-        assert(model2.intercept ~== interceptR2 absTol 1E-3)
-        assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+        assert(model2.intercept ~== interceptR2 absTol 1E-2)
+        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
 
         model1.transform(datasetWithDenseFeature).select("features", "prediction")
           .collect().foreach {
@@ -592,21 +594,47 @@ class LinearRegressionSuite
       }
 
       /*
-         Use the following R code to generate model training results.
-
-         predictions <- predict(fit, newx=features)
-         residuals <- label - predictions
-         > mean(residuals^2) # MSE
-         [1] 0.009720325
-         > mean(abs(residuals)) # MAD
-         [1] 0.07863206
-         > cor(predictions, label)^2# r^2
-                 [,1]
-         s0 0.9998749
+         # Use the following R code to generate model training results.
+
+         # path/part-00000 is the file generated by running LinearDataGenerator.generateLinearInput
+         # as described before the beforeAll() method.
+         d1 <- read.csv("path/part-00000", header=FALSE, stringsAsFactors=FALSE)
+         fit <- glm(V1 ~ V2 + V3, data = d1, family = "gaussian")
+         names(f1)[1] = c("V2")
+         names(f1)[2] = c("V3")
+         f1 <- data.frame(as.numeric(d1$V2), as.numeric(d1$V3))
+         predictions <- predict(fit, newdata=f1)
+         l1 <- as.numeric(d1$V1)
+
+         residuals <- l1 - predictions
+         > mean(residuals^2)           # MSE
+         [1] 0.00985449
+         > mean(abs(residuals))        # MAD
+         [1] 0.07961668
+         > cor(predictions, l1)^2   # r^2
+         [1] 0.9998737
+
+         > summary(fit)
+
+          Call:
+          glm(formula = V1 ~ V2 + V3, family = "gaussian", data = d1)
+
+          Deviance Residuals:
+               Min        1Q    Median        3Q       Max
+          -0.47082  -0.06797   0.00002   0.06725   0.34635
+
+          Coefficients:
+                       Estimate Std. Error t value Pr(>|t|)
+          (Intercept) 6.3022157  0.0018600    3388   <2e-16 ***
+          V2          4.6982442  0.0011805    3980   <2e-16 ***
+          V3          7.1994344  0.0009044    7961   <2e-16 ***
+          ---
+
+          ....
        */
-      assert(model.summary.meanSquaredError ~== 0.00972035 relTol 1E-5)
-      assert(model.summary.meanAbsoluteError ~== 0.07863206 relTol 1E-5)
-      assert(model.summary.r2 ~== 0.9998749 relTol 1E-5)
+      assert(model.summary.meanSquaredError ~== 0.00985449 relTol 1E-4)
+      assert(model.summary.meanAbsoluteError ~== 0.07961668 relTol 1E-4)
+      assert(model.summary.r2 ~== 0.9998737 relTol 1E-4)
 
       // Normal solver uses "WeightedLeastSquares". This algorithm does not generate
       // objective history because it does not run through iterations.
@@ -621,14 +649,14 @@ class LinearRegressionSuite
         // To clalify that the normal solver is used here.
         assert(model.summary.objectiveHistory.length == 1)
         assert(model.summary.objectiveHistory(0) == 0.0)
-        val devianceResidualsR = Array(-0.35566, 0.34504)
-        val seCoefR = Array(0.0011756, 0.0009032, 0.0018489)
-        val tValsR = Array(3998, 7971, 3407)
+        val devianceResidualsR = Array(-0.47082, 0.34635)
+        val seCoefR = Array(0.0011805, 0.0009044, 0.0018600)
+        val tValsR = Array(3980, 7961, 3388)
         val pValsR = Array(0, 0, 0)
         model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
-          assert(x._1 ~== x._2 absTol 1E-5) }
+          assert(x._1 ~== x._2 absTol 1E-4) }
         model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x =>
-          assert(x._1 ~== x._2 absTol 1E-5) }
+          assert(x._1 ~== x._2 absTol 1E-4) }
         model.summary.tValues.map(_.round).zip(tValsR).foreach{ x => assert(x._1 === x._2) }
         model.summary.pValues.map(_.round).zip(pValsR).foreach{ x => assert(x._1 === x._2) }
       }
author	Nakul Jindal <njindal@us.ibm.com>	2015-12-08 11:08:27 +0000
committer	Sean Owen <sowen@cloudera.com>	2015-12-08 11:08:27 +0000
commit	037b7e76a7f8b59e031873a768d81417dd180472 (patch)
tree	2b782043ec03d134efdce90105482a934230b9e9 /mllib/src
parent	708129187a460aca30790281e9221c0cd5e271df (diff)
download	spark-037b7e76a7f8b59e031873a768d81417dd180472.tar.gz spark-037b7e76a7f8b59e031873a768d81417dd180472.tar.bz2 spark-037b7e76a7f8b59e031873a768d81417dd180472.zip