[SPARK-13590][ML][DOC] Document spark.ml LiR, LoR and AFTSurvivalRegression behavior difference

## What changes were proposed in this pull request? When fitting ```LinearRegressionModel``` (by the "l-bfgs" solver) and ```LogisticRegressionModel``` w/o intercept on a dataset with a constant nonzero column, spark.ml produces the same model as R glmnet but a different one from LIBSVM. When fitting ```AFTSurvivalRegressionModel``` w/o intercept on a dataset with a constant nonzero column, spark.ml produces a different model compared with R survival::survreg. We should output a warning message and clarify this condition in the documentation. ## How was this patch tested? Document change, no unit test. cc mengxr Author: Yanbo Liang <ybliang8@gmail.com> Closes #12731 from yanboliang/spark-13590.
author: Yanbo Liang <ybliang8@gmail.com> 2016-06-07 15:25:36 -0700
committer: Yanbo Liang <ybliang8@gmail.com> 2016-06-07 15:25:36 -0700
commit: 6ecedf39b44c9acd58cdddf1a31cf11e8e24428c (patch)
tree: 480604299bd07f81c1166d80214b8a1433ff95fd /mllib
parent: 890baaca5078df0b50c0054f55a2c33023f7fd67 (diff)
download: spark-6ecedf39b44c9acd58cdddf1a31cf11e8e24428c.tar.gz
spark-6ecedf39b44c9acd58cdddf1a31cf11e8e24428c.tar.bz2
spark-6ecedf39b44c9acd58cdddf1a31cf11e8e24428c.zip
3 files changed, 22 insertions, 1 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 1ea4d90e16..51ede15d6c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -333,6 +333,13 @@ class LogisticRegression @Since("1.2.0") (
         val featuresMean = summarizer.mean.toArray
         val featuresStd = summarizer.variance.toArray.map(math.sqrt)
 
+        if (!$(fitIntercept) && (0 until numFeatures).exists { i =>
+          featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) {
+          logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
+            "constant nonzero column, Spark MLlib outputs zero coefficients for constant " +
+            "nonzero columns. This behavior is the same as R glmnet but different from LIBSVM.")
+        }
+
         val regParamL1 = $(elasticNetParam) * $(regParam)
         val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam)
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index c440073842..e5f23f44bc 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -209,11 +209,18 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
     }
 
     val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt)
+    val numFeatures = featuresStd.size
+
+    if (!$(fitIntercept) && (0 until numFeatures).exists { i =>
+        featuresStd(i) == 0.0 && featuresSummarizer.mean(i) != 0.0 }) {
+      logWarning("Fitting AFTSurvivalRegressionModel without intercept on dataset with " +
+        "constant nonzero column, Spark MLlib outputs zero coefficients for constant nonzero " +
+        "columns. This behavior is different from R survival::survreg.")
+    }
 
     val costFun = new AFTCostFun(instances, $(fitIntercept), featuresStd)
     val optimizer = new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol))
 
-    val numFeatures = featuresStd.size
     /*
        The parameters vector has three parts:
        the first element: Double, log(sigma), the log of scale parameter
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 6be2584785..52ec40e15b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -267,6 +267,13 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
     val featuresMean = featuresSummarizer.mean.toArray
     val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt)
 
+    if (!$(fitIntercept) && (0 until numFeatures).exists { i =>
+      featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) {
+      logWarning("Fitting LinearRegressionModel without intercept on dataset with " +
+        "constant nonzero column, Spark MLlib outputs zero coefficients for constant nonzero " +
+        "columns. This behavior is the same as R glmnet but different from LIBSVM.")
+    }
+
     // Since we implicitly do the feature scaling when we compute the cost function
     // to improve the convergence, the effective regParam will be changed.
     val effectiveRegParam = $(regParam) / yStd
author	Yanbo Liang <ybliang8@gmail.com>	2016-06-07 15:25:36 -0700
committer	Yanbo Liang <ybliang8@gmail.com>	2016-06-07 15:25:36 -0700
commit	6ecedf39b44c9acd58cdddf1a31cf11e8e24428c (patch)
tree	480604299bd07f81c1166d80214b8a1433ff95fd /mllib
parent	890baaca5078df0b50c0054f55a2c33023f7fd67 (diff)
download	spark-6ecedf39b44c9acd58cdddf1a31cf11e8e24428c.tar.gz spark-6ecedf39b44c9acd58cdddf1a31cf11e8e24428c.tar.bz2 spark-6ecedf39b44c9acd58cdddf1a31cf11e8e24428c.zip