aboutsummaryrefslogtreecommitdiff
path: root/mllib/src
diff options
context:
space:
mode:
authorNakul Jindal <njindal@us.ibm.com>2015-12-08 11:08:27 +0000
committerSean Owen <sowen@cloudera.com>2015-12-08 11:08:27 +0000
commit037b7e76a7f8b59e031873a768d81417dd180472 (patch)
tree2b782043ec03d134efdce90105482a934230b9e9 /mllib/src
parent708129187a460aca30790281e9221c0cd5e271df (diff)
downloadspark-037b7e76a7f8b59e031873a768d81417dd180472.tar.gz
spark-037b7e76a7f8b59e031873a768d81417dd180472.tar.bz2
spark-037b7e76a7f8b59e031873a768d81417dd180472.zip
[SPARK-11439][ML] Optimization of creating sparse feature without dense one
Sparse feature generated in LinearDataGenerator does not create dense vectors as an intermediate any more. Author: Nakul Jindal <njindal@us.ibm.com> Closes #9756 from nakul02/SPARK-11439_sparse_without_creating_dense_feature.
Diffstat (limited to 'mllib/src')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala44
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala6
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala214
3 files changed, 142 insertions, 122 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
index 6ff07eed6c..094528e2ec 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala
@@ -24,7 +24,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}
import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
-import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.{BLAS, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
@@ -131,35 +131,27 @@ object LinearDataGenerator {
eps: Double,
sparsity: Double): Seq[LabeledPoint] = {
require(0.0 <= sparsity && sparsity <= 1.0)
- val rnd = new Random(seed)
- val x = Array.fill[Array[Double]](nPoints)(
- Array.fill[Double](weights.length)(rnd.nextDouble()))
-
- val sparseRnd = new Random(seed)
- x.foreach { v =>
- var i = 0
- val len = v.length
- while (i < len) {
- if (sparseRnd.nextDouble() < sparsity) {
- v(i) = 0.0
- } else {
- v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
- }
- i += 1
- }
- }
- val y = x.map { xi =>
- blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
- }
+ val rnd = new Random(seed)
+ def rndElement(i: Int) = {(rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)}
- y.zip(x).map { p =>
- if (sparsity == 0.0) {
+ if (sparsity == 0.0) {
+ (0 until nPoints).map { _ =>
+ val features = Vectors.dense(weights.indices.map { rndElement(_) }.toArray)
+ val label = BLAS.dot(Vectors.dense(weights), features) +
+ intercept + eps * rnd.nextGaussian()
// Return LabeledPoints with DenseVector
- LabeledPoint(p._1, Vectors.dense(p._2))
- } else {
+ LabeledPoint(label, features)
+ }
+ } else {
+ (0 until nPoints).map { _ =>
+ val indices = weights.indices.filter { _ => rnd.nextDouble() <= sparsity}
+ val values = indices.map { rndElement(_) }
+ val features = Vectors.sparse(weights.length, indices.toArray, values.toArray)
+ val label = BLAS.dot(Vectors.dense(weights), features) +
+ intercept + eps * rnd.nextGaussian()
// Return LabeledPoints with SparseVector
- LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
+ LabeledPoint(label, features)
}
}
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index 60886bf77d..954d3bedc1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -65,15 +65,15 @@ class RegressionEvaluatorSuite
// default = rmse
val evaluator = new RegressionEvaluator()
- assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)
+ assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)
// r2 score
evaluator.setMetricName("r2")
- assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)
+ assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)
// mae
evaluator.setMetricName("mae")
- assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
+ assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
}
test("read/write") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 2bdc0e184d..2f3e703f4c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -42,6 +42,7 @@ class LinearRegressionSuite
In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
is the same as the one trained by R's glmnet package. The following instruction
describes how to reproduce the data in R.
+ In a spark-shell, use the following code:
import org.apache.spark.mllib.util.LinearDataGenerator
val data =
@@ -184,15 +185,15 @@ class LinearRegressionSuite
3 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
- as.numeric.data.V2. 6.995908
- as.numeric.data.V3. 5.275131
+ as.numeric.data.V2. 6.973403
+ as.numeric.data.V3. 5.284370
*/
- val coefficientsR = Vectors.dense(6.995908, 5.275131)
+ val coefficientsR = Vectors.dense(6.973403, 5.284370)
- assert(model1.intercept ~== 0 absTol 1E-3)
- assert(model1.coefficients ~= coefficientsR relTol 1E-3)
- assert(model2.intercept ~== 0 absTol 1E-3)
- assert(model2.coefficients ~= coefficientsR relTol 1E-3)
+ assert(model1.intercept ~== 0 absTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR relTol 1E-2)
+ assert(model2.intercept ~== 0 absTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR relTol 1E-2)
/*
Then again with the data with no intercept:
@@ -235,14 +236,14 @@ class LinearRegressionSuite
> coefficients
3 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 6.24300
- as.numeric.data.V2. 4.024821
- as.numeric.data.V3. 6.679841
+ (Intercept) 6.242284
+ as.numeric.d1.V2. 4.019605
+ as.numeric.d1.V3. 6.679538
*/
- val interceptR1 = 6.24300
- val coefficientsR1 = Vectors.dense(4.024821, 6.679841)
- assert(model1.intercept ~== interceptR1 relTol 1E-3)
- assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+ val interceptR1 = 6.242284
+ val coefficientsR1 = Vectors.dense(4.019605, 6.679538)
+ assert(model1.intercept ~== interceptR1 relTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
/*
coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0,
@@ -296,14 +297,14 @@ class LinearRegressionSuite
3 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
- as.numeric.data.V2. 6.299752
- as.numeric.data.V3. 4.772913
+ as.numeric.data.V2. 6.272927
+ as.numeric.data.V3. 4.782604
*/
val interceptR1 = 0.0
- val coefficientsR1 = Vectors.dense(6.299752, 4.772913)
+ val coefficientsR1 = Vectors.dense(6.272927, 4.782604)
- assert(model1.intercept ~== interceptR1 absTol 1E-3)
- assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+ assert(model1.intercept ~== interceptR1 absTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
/*
coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 1.0,
@@ -312,14 +313,14 @@ class LinearRegressionSuite
3 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
- as.numeric.data.V2. 6.232193
- as.numeric.data.V3. 4.764229
+ as.numeric.data.V2. 6.207817
+ as.numeric.data.V3. 4.775780
*/
val interceptR2 = 0.0
- val coefficientsR2 = Vectors.dense(6.232193, 4.764229)
+ val coefficientsR2 = Vectors.dense(6.207817, 4.775780)
- assert(model2.intercept ~== interceptR2 absTol 1E-3)
- assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+ assert(model2.intercept ~== interceptR2 absTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
model1.transform(datasetWithDenseFeature).select("features", "prediction")
.collect().foreach {
@@ -347,15 +348,15 @@ class LinearRegressionSuite
> coefficients
3 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 5.269376
- as.numeric.data.V2. 3.736216
- as.numeric.data.V3. 5.712356)
+ (Intercept) 5.260103
+ as.numeric.d1.V2. 3.725522
+ as.numeric.d1.V3. 5.711203
*/
- val interceptR1 = 5.269376
- val coefficientsR1 = Vectors.dense(3.736216, 5.712356)
+ val interceptR1 = 5.260103
+ val coefficientsR1 = Vectors.dense(3.725522, 5.711203)
- assert(model1.intercept ~== interceptR1 relTol 1E-3)
- assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+ assert(model1.intercept ~== interceptR1 relTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
/*
coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
@@ -363,15 +364,15 @@ class LinearRegressionSuite
> coefficients
3 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 5.791109
- as.numeric.data.V2. 3.435466
- as.numeric.data.V3. 5.910406
+ (Intercept) 5.790885
+ as.numeric.d1.V2. 3.432373
+ as.numeric.d1.V3. 5.919196
*/
- val interceptR2 = 5.791109
- val coefficientsR2 = Vectors.dense(3.435466, 5.910406)
+ val interceptR2 = 5.790885
+ val coefficientsR2 = Vectors.dense(3.432373, 5.919196)
- assert(model2.intercept ~== interceptR2 relTol 1E-3)
- assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+ assert(model2.intercept ~== interceptR2 relTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
case Row(features: DenseVector, prediction1: Double) =>
@@ -398,15 +399,15 @@ class LinearRegressionSuite
> coefficients
3 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) .
- as.numeric.data.V2. 5.522875
- as.numeric.data.V3. 4.214502
+ (Intercept) .
+ as.numeric.d1.V2. 5.493430
+ as.numeric.d1.V3. 4.223082
*/
val interceptR1 = 0.0
- val coefficientsR1 = Vectors.dense(5.522875, 4.214502)
+ val coefficientsR1 = Vectors.dense(5.493430, 4.223082)
- assert(model1.intercept ~== interceptR1 absTol 1E-3)
- assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+ assert(model1.intercept ~== interceptR1 absTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
/*
coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
@@ -415,14 +416,14 @@ class LinearRegressionSuite
3 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
- as.numeric.data.V2. 5.263704
- as.numeric.data.V3. 4.187419
+ as.numeric.d1.V2. 5.244324
+ as.numeric.d1.V3. 4.203106
*/
val interceptR2 = 0.0
- val coefficientsR2 = Vectors.dense(5.263704, 4.187419)
+ val coefficientsR2 = Vectors.dense(5.244324, 4.203106)
- assert(model2.intercept ~== interceptR2 absTol 1E-3)
- assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+ assert(model2.intercept ~== interceptR2 absTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach {
case Row(features: DenseVector, prediction1: Double) =>
@@ -457,15 +458,15 @@ class LinearRegressionSuite
> coefficients
3 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 6.324108
- as.numeric.data.V2. 3.168435
- as.numeric.data.V3. 5.200403
+ (Intercept) 5.689855
+ as.numeric.d1.V2. 3.661181
+ as.numeric.d1.V3. 6.000274
*/
- val interceptR1 = 5.696056
- val coefficientsR1 = Vectors.dense(3.670489, 6.001122)
+ val interceptR1 = 5.689855
+ val coefficientsR1 = Vectors.dense(3.661181, 6.000274)
- assert(model1.intercept ~== interceptR1 relTol 1E-3)
- assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+ assert(model1.intercept ~== interceptR1 relTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
/*
coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6
@@ -473,15 +474,15 @@ class LinearRegressionSuite
> coefficients
3 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) 6.114723
- as.numeric.data.V2. 3.409937
- as.numeric.data.V3. 6.146531
+ (Intercept) 6.113890
+ as.numeric.d1.V2. 3.407021
+ as.numeric.d1.V3. 6.152512
*/
- val interceptR2 = 6.114723
- val coefficientsR2 = Vectors.dense(3.409937, 6.146531)
+ val interceptR2 = 6.113890
+ val coefficientsR2 = Vectors.dense(3.407021, 6.152512)
- assert(model2.intercept ~== interceptR2 relTol 1E-3)
- assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+ assert(model2.intercept ~== interceptR2 relTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
model1.transform(datasetWithDenseFeature).select("features", "prediction")
.collect().foreach {
@@ -518,15 +519,15 @@ class LinearRegressionSuite
> coefficients
3 x 1 sparse Matrix of class "dgCMatrix"
s0
- (Intercept) .
- as.numeric.dataM.V2. 5.673348
- as.numeric.dataM.V3. 4.322251
+ (Intercept) .
+ as.numeric.d1.V2. 5.643748
+ as.numeric.d1.V3. 4.331519
*/
val interceptR1 = 0.0
- val coefficientsR1 = Vectors.dense(5.673348, 4.322251)
+ val coefficientsR1 = Vectors.dense(5.643748, 4.331519)
- assert(model1.intercept ~== interceptR1 absTol 1E-3)
- assert(model1.coefficients ~= coefficientsR1 relTol 1E-3)
+ assert(model1.intercept ~== interceptR1 absTol 1E-2)
+ assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)
/*
coefficients <- coef(glmnet(features, label, family="gaussian", alpha = 0.3,
@@ -535,14 +536,15 @@ class LinearRegressionSuite
3 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .
- as.numeric.data.V2. 5.477988
- as.numeric.data.V3. 4.297622
+ as.numeric.d1.V2. 5.455902
+ as.numeric.d1.V3. 4.312266
+
*/
val interceptR2 = 0.0
- val coefficientsR2 = Vectors.dense(5.477988, 4.297622)
+ val coefficientsR2 = Vectors.dense(5.455902, 4.312266)
- assert(model2.intercept ~== interceptR2 absTol 1E-3)
- assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)
+ assert(model2.intercept ~== interceptR2 absTol 1E-2)
+ assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)
model1.transform(datasetWithDenseFeature).select("features", "prediction")
.collect().foreach {
@@ -592,21 +594,47 @@ class LinearRegressionSuite
}
/*
- Use the following R code to generate model training results.
-
- predictions <- predict(fit, newx=features)
- residuals <- label - predictions
- > mean(residuals^2) # MSE
- [1] 0.009720325
- > mean(abs(residuals)) # MAD
- [1] 0.07863206
- > cor(predictions, label)^2# r^2
- [,1]
- s0 0.9998749
+ # Use the following R code to generate model training results.
+
+ # path/part-00000 is the file generated by running LinearDataGenerator.generateLinearInput
+ # as described before the beforeAll() method.
+ d1 <- read.csv("path/part-00000", header=FALSE, stringsAsFactors=FALSE)
+ fit <- glm(V1 ~ V2 + V3, data = d1, family = "gaussian")
+ names(f1)[1] = c("V2")
+ names(f1)[2] = c("V3")
+ f1 <- data.frame(as.numeric(d1$V2), as.numeric(d1$V3))
+ predictions <- predict(fit, newdata=f1)
+ l1 <- as.numeric(d1$V1)
+
+ residuals <- l1 - predictions
+ > mean(residuals^2) # MSE
+ [1] 0.00985449
+ > mean(abs(residuals)) # MAD
+ [1] 0.07961668
+ > cor(predictions, l1)^2 # r^2
+ [1] 0.9998737
+
+ > summary(fit)
+
+ Call:
+ glm(formula = V1 ~ V2 + V3, family = "gaussian", data = d1)
+
+ Deviance Residuals:
+ Min 1Q Median 3Q Max
+ -0.47082 -0.06797 0.00002 0.06725 0.34635
+
+ Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+ (Intercept) 6.3022157 0.0018600 3388 <2e-16 ***
+ V2 4.6982442 0.0011805 3980 <2e-16 ***
+ V3 7.1994344 0.0009044 7961 <2e-16 ***
+ ---
+
+ ....
*/
- assert(model.summary.meanSquaredError ~== 0.00972035 relTol 1E-5)
- assert(model.summary.meanAbsoluteError ~== 0.07863206 relTol 1E-5)
- assert(model.summary.r2 ~== 0.9998749 relTol 1E-5)
+ assert(model.summary.meanSquaredError ~== 0.00985449 relTol 1E-4)
+ assert(model.summary.meanAbsoluteError ~== 0.07961668 relTol 1E-4)
+ assert(model.summary.r2 ~== 0.9998737 relTol 1E-4)
// Normal solver uses "WeightedLeastSquares". This algorithm does not generate
// objective history because it does not run through iterations.
@@ -621,14 +649,14 @@ class LinearRegressionSuite
// To clalify that the normal solver is used here.
assert(model.summary.objectiveHistory.length == 1)
assert(model.summary.objectiveHistory(0) == 0.0)
- val devianceResidualsR = Array(-0.35566, 0.34504)
- val seCoefR = Array(0.0011756, 0.0009032, 0.0018489)
- val tValsR = Array(3998, 7971, 3407)
+ val devianceResidualsR = Array(-0.47082, 0.34635)
+ val seCoefR = Array(0.0011805, 0.0009044, 0.0018600)
+ val tValsR = Array(3980, 7961, 3388)
val pValsR = Array(0, 0, 0)
model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
- assert(x._1 ~== x._2 absTol 1E-5) }
+ assert(x._1 ~== x._2 absTol 1E-4) }
model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x =>
- assert(x._1 ~== x._2 absTol 1E-5) }
+ assert(x._1 ~== x._2 absTol 1E-4) }
model.summary.tValues.map(_.round).zip(tValsR).foreach{ x => assert(x._1 === x._2) }
model.summary.pValues.map(_.round).zip(pValsR).foreach{ x => assert(x._1 === x._2) }
}