diff options
author | DB Tsai <dbt@netflix.com> | 2015-07-07 15:46:44 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2015-07-07 15:46:44 -0700 |
commit | 3bf20c27ff3cb3a32bfc3a44e08a57865957c117 (patch) | |
tree | a40ab74ff4d59eee13206380abb549553639667d /mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala | |
parent | 35d781e71b68eb6da7f49fdae40fa6c4f8e27060 (diff) | |
download | spark-3bf20c27ff3cb3a32bfc3a44e08a57865957c117.tar.gz spark-3bf20c27ff3cb3a32bfc3a44e08a57865957c117.tar.bz2 spark-3bf20c27ff3cb3a32bfc3a44e08a57865957c117.zip |
[SPARK-8845] [ML] ML use of Breeze optimization: use adjustedValue instead of value
In LinearRegression and LogisticRegression, we use Breeze's optimizers (LBFGS and OWLQN) and read `State.value` to obtain the current objective. However, Breeze's documentation suggests that `value` and `adjustedValue` differ for some optimizers, possibly including OWLQN: https://github.com/scalanlp/breeze/blob/26faf622862e8d7a42a401aef601347aac655f2b/math/src/main/scala/breeze/optimize/FirstOrderMinimizer.scala#L36
If that is the case, then we should use adjustedValue instead of value. This is relevant to SPARK-8538 and SPARK-8539, where we will provide the objective trace to the user.
Author: DB Tsai <dbt@netflix.com>
Closes #7245 from dbtsai/SPARK-8845 and squashes the following commits:
fa4c91e [DB Tsai] address feedback
e6caac1 [DB Tsai] java style multiline comment
b10c574 [DB Tsai] address feedback
c9ff81e [DB Tsai] first commit
Diffstat (limited to 'mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala | 83 |
1 file changed, 44 insertions, 39 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2e6eedd45a..3967151f76 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -116,7 +116,7 @@ class LogisticRegression(override val uid: String) case ((summarizer: MultivariateOnlineSummarizer, labelSummarizer: MultiClassSummarizer), (label: Double, features: Vector)) => (summarizer.add(features), labelSummarizer.add(label)) - }, + }, combOp = (c1, c2) => (c1, c2) match { case ((summarizer1: MultivariateOnlineSummarizer, classSummarizer1: MultiClassSummarizer), (summarizer2: MultivariateOnlineSummarizer, @@ -166,18 +166,18 @@ class LogisticRegression(override val uid: String) Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures) if ($(fitIntercept)) { - /** - * For binary logistic regression, when we initialize the weights as zeros, - * it will converge faster if we initialize the intercept such that - * it follows the distribution of the labels. - * - * {{{ - * P(0) = 1 / (1 + \exp(b)), and - * P(1) = \exp(b) / (1 + \exp(b)) - * }}}, hence - * {{{ - * b = \log{P(1) / P(0)} = \log{count_1 / count_0} - * }}} + /* + For binary logistic regression, when we initialize the weights as zeros, + it will converge faster if we initialize the intercept such that + it follows the distribution of the labels. 
+ + {{{ + P(0) = 1 / (1 + \exp(b)), and + P(1) = \exp(b) / (1 + \exp(b)) + }}}, hence + {{{ + b = \log{P(1) / P(0)} = \log{count_1 / count_0} + }}} */ initialWeightsWithIntercept.toArray(numFeatures) = math.log(histogram(1).toDouble / histogram(0).toDouble) @@ -186,39 +186,48 @@ class LogisticRegression(override val uid: String) val states = optimizer.iterations(new CachedDiffFunction(costFun), initialWeightsWithIntercept.toBreeze.toDenseVector) - var state = states.next() - val lossHistory = mutable.ArrayBuilder.make[Double] + val (weights, intercept, objectiveHistory) = { + /* + Note that in Logistic Regression, the objective history (loss + regularization) + is log-likelihood which is invariance under feature standardization. As a result, + the objective history from optimizer is the same as the one in the original space. + */ + val arrayBuilder = mutable.ArrayBuilder.make[Double] + var state: optimizer.State = null + while (states.hasNext) { + state = states.next() + arrayBuilder += state.adjustedValue + } - while (states.hasNext) { - lossHistory += state.value - state = states.next() - } - lossHistory += state.value + if (state == null) { + val msg = s"${optimizer.getClass.getName} failed." + logError(msg) + throw new SparkException(msg) + } - // The weights are trained in the scaled space; we're converting them back to - // the original space. - val weightsWithIntercept = { + /* + The weights are trained in the scaled space; we're converting them back to + the original space. + Note that the intercept in scaled space and original space is the same; + as a result, no scaling is needed. + */ val rawWeights = state.x.toArray.clone() var i = 0 - // Note that the intercept in scaled space and original space is the same; - // as a result, no scaling is needed. 
while (i < numFeatures) { rawWeights(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } i += 1 } - Vectors.dense(rawWeights) + + if ($(fitIntercept)) { + (Vectors.dense(rawWeights.dropRight(1)).compressed, rawWeights.last, arrayBuilder.result()) + } else { + (Vectors.dense(rawWeights).compressed, 0.0, arrayBuilder.result()) + } } if (handlePersistence) instances.unpersist() - val (weights, intercept) = if ($(fitIntercept)) { - (Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), - weightsWithIntercept(weightsWithIntercept.size - 1)) - } else { - (weightsWithIntercept, 0.0) - } - - new LogisticRegressionModel(uid, weights.compressed, intercept) + copyValues(new LogisticRegressionModel(uid, weights, intercept)) } override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra) @@ -423,16 +432,12 @@ private class LogisticAggregator( require(dim == data.size, s"Dimensions mismatch when adding new sample." + s" Expecting $dim but got ${data.size}.") - val dataSize = data.size - val localWeightsArray = weightsArray val localGradientSumArray = gradientSumArray numClasses match { case 2 => - /** - * For Binary Logistic Regression. - */ + // For Binary Logistic Regression. val margin = - { var sum = 0.0 data.foreachActive { (index, value) => |