author    Feynman Liang <fliang@databricks.com>    2015-08-27 21:55:20 -0700
committer Xiangrui Meng <meng@databricks.com>      2015-08-27 21:55:28 -0700
commit    ede8c625cd6631d54961c3b39996e3c60bc08be4 (patch)
tree      6818a39fef93fa444146c9fbce238047b76e9301
parent    6ccc0df8e416993730e5c6550a98cb6f2187a914 (diff)
[SPARK-9905] [ML] [DOC] Adds LinearRegressionSummary user guide
* Adds user guide for `LinearRegressionSummary`
* Fixes unresolved issues in #8197

CC jkbradley mengxr

Author: Feynman Liang <fliang@databricks.com>

Closes #8491 from feynmanliang/SPARK-9905.

(cherry picked from commit af0e1249b1c881c0fa7a921fd21fd2c27214b980)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
-rw-r--r-- docs/ml-linear-methods.md | 140
1 file changed, 127 insertions(+), 13 deletions(-)
diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 2761aeb789..cdd9d4999f 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -34,7 +34,7 @@ net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf).
Mathematically, it is defined as a convex combination of the $L_1$ and
the $L_2$ regularization terms:
`\[
-\alpha~\lambda \|\wv\|_1 + (1-\alpha) \frac{\lambda}{2}\|\wv\|_2^2, \alpha \in [0, 1], \lambda \geq 0.
+\alpha \left( \lambda \|\wv\|_1 \right) + (1-\alpha) \left( \frac{\lambda}{2}\|\wv\|_2^2 \right) , \alpha \in [0, 1], \lambda \geq 0
\]`
By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$
regularization as special cases. For example, if a [linear
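
Concretely (an editorial illustration, not part of the patch), the endpoints of $\alpha$ recover the two classical penalties:
`\[
\alpha = 1: \ \lambda \|\wv\|_1 \ \text{(lasso)}, \qquad
\alpha = 0: \ \frac{\lambda}{2}\|\wv\|_2^2 \ \text{(ridge)}.
\]`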
@@ -95,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample {
SparkContext sc = new SparkContext(conf);
SQLContext sql = new SQLContext(sc);
- String path = "sample_libsvm_data.txt";
+ String path = "data/mllib/sample_libsvm_data.txt";
// Load training data
DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class);
@@ -103,7 +103,7 @@ public class LogisticRegressionWithElasticNetExample {
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(0.3)
- .setElasticNetParam(0.8)
+ .setElasticNetParam(0.8);
// Fit the model
LogisticRegressionModel lrModel = lr.fit(training);
@@ -158,10 +158,12 @@ This will likely change when multiclass classification is supported.
Continuing the earlier example:
{% highlight scala %}
+import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
+
// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example
val trainingSummary = lrModel.summary
-// Obtain the loss per iteration.
+// Obtain the objective per iteration.
val objectiveHistory = trainingSummary.objectiveHistory
objectiveHistory.foreach(loss => println(loss))
@@ -173,17 +175,14 @@ val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary
// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
val roc = binarySummary.roc
roc.show()
-roc.select("FPR").show()
println(binarySummary.areaUnderROC)
-// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
-// this selected threshold.
+// Set the model threshold to maximize F-Measure
val fMeasure = binarySummary.fMeasureByThreshold
val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).
select("threshold").head().getDouble(0)
-logReg.setThreshold(bestThreshold)
-logReg.fit(logRegDataFrame)
+lrModel.setThreshold(bestThreshold)
{% endhighlight %}
</div>
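
An editorial aside, not part of the patch: once `setThreshold` is called on the fitted model, subsequent transforms use the new threshold. A minimal Scala sketch, assuming the `training` DataFrame from the earlier example is still in scope:

{% highlight scala %}
// Editorial sketch: predictions now reflect the F-measure-maximizing
// threshold set via lrModel.setThreshold(bestThreshold) above.
val predictions = lrModel.transform(training)
predictions.select("label", "probability", "prediction").show(5)
{% endhighlight %}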
@@ -199,8 +198,12 @@ This will likely change when multiclass classification is supported.
Continuing the earlier example:
{% highlight java %}
+import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary;
+import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary;
+import org.apache.spark.sql.functions;
+
// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example
-LogisticRegressionTrainingSummary trainingSummary = logRegModel.summary();
+LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
// Obtain the loss per iteration.
double[] objectiveHistory = trainingSummary.objectiveHistory();
@@ -222,20 +225,131 @@ System.out.println(binarySummary.areaUnderROC());
// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
// this selected threshold.
DataFrame fMeasure = binarySummary.fMeasureByThreshold();
-double maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0);
+double maxFMeasure = fMeasure.select(functions.max("F-Measure")).head().getDouble(0);
double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)).
select("threshold").head().getDouble(0);
-logReg.setThreshold(bestThreshold);
-logReg.fit(logRegDataFrame);
+lrModel.setThreshold(bestThreshold);
{% endhighlight %}
</div>
+<!--- TODO: Add python model summaries once implemented -->
<div data-lang="python" markdown="1">
Logistic regression model summary is not yet supported in Python.
</div>
</div>
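
An editorial note on the `asInstanceOf` cast used in the examples above: a pattern match avoids a `ClassCastException` if a non-binary summary is ever returned once multiclass classification is supported. A Scala sketch, not part of the patch:

{% highlight scala %}
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary

// Editorial sketch: match on the runtime type instead of casting unconditionally.
trainingSummary match {
  case binary: BinaryLogisticRegressionSummary =>
    println(s"areaUnderROC: ${binary.areaUnderROC}")
  case _ =>
    println("Summary is not binary; skipping ROC metrics")
}
{% endhighlight %}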
+## Example: Linear Regression
+
+The interface for working with linear regression models and model
+summaries is similar to the logistic regression case. The following
+example demonstrates training an elastic net regularized linear
+regression model and extracting model summary statistics.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.mllib.util.MLUtils
+
+// Load training data
+val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+
+val lr = new LinearRegression()
+ .setMaxIter(10)
+ .setRegParam(0.3)
+ .setElasticNetParam(0.8)
+
+// Fit the model
+val lrModel = lr.fit(training)
+
+// Print the weights and intercept for linear regression
+println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}")
+
+// Summarize the model over the training set and print out some metrics
+val trainingSummary = lrModel.summary
+println(s"numIterations: ${trainingSummary.totalIterations}")
+println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
+trainingSummary.residuals.show()
+println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
+println(s"r2: ${trainingSummary.r2}")
+{% endhighlight %}
+</div>
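
A brief editorial sketch (not part of the patch): the fitted `LinearRegressionModel` can also be applied to a DataFrame to obtain predictions, assuming `training` and `lrModel` from the Scala example above:

{% highlight scala %}
// Editorial sketch: applying the fitted model appends a "prediction" column.
lrModel.transform(training).select("features", "label", "prediction").show(5)
{% endhighlight %}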
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import org.apache.spark.ml.regression.LinearRegression;
+import org.apache.spark.ml.regression.LinearRegressionModel;
+import org.apache.spark.ml.regression.LinearRegressionTrainingSummary;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+
+public class LinearRegressionWithElasticNetExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf()
+ .setAppName("Linear Regression with Elastic Net Example");
+
+ SparkContext sc = new SparkContext(conf);
+ SQLContext sql = new SQLContext(sc);
+ String path = "data/mllib/sample_libsvm_data.txt";
+
+ // Load training data
+ DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class);
+
+ LinearRegression lr = new LinearRegression()
+ .setMaxIter(10)
+ .setRegParam(0.3)
+ .setElasticNetParam(0.8);
+
+ // Fit the model
+ LinearRegressionModel lrModel = lr.fit(training);
+
+ // Print the weights and intercept for linear regression
+ System.out.println("Weights: " + lrModel.weights() + " Intercept: " + lrModel.intercept());
+
+ // Summarize the model over the training set and print out some metrics
+ LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
+ System.out.println("numIterations: " + trainingSummary.totalIterations());
+ System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
+ trainingSummary.residuals().show();
+ System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
+ System.out.println("r2: " + trainingSummary.r2());
+ }
+}
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+<!--- TODO: Add python model summaries once implemented -->
+{% highlight python %}
+from pyspark.ml.regression import LinearRegression
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
+
+# Load training data
+training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+
+lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
+
+# Fit the model
+lrModel = lr.fit(training)
+
+# Print the weights and intercept for linear regression
+print("Weights: " + str(lrModel.weights))
+print("Intercept: " + str(lrModel.intercept))
+
+# Linear regression model summary is not yet supported in Python.
+{% endhighlight %}
+</div>
+
+</div>
+
# Optimization
The optimization algorithm underlying the implementation is called