path: root/docs/mllib-optimization.md
author    Michael Giannakopoulos <miccagiann@gmail.com>  2014-07-20 20:48:44 -0700
committer Xiangrui Meng <meng@databricks.com>  2014-07-20 20:48:44 -0700
commit    db56f2df1b8027171da1b8d2571d1f2ef1e103b6 (patch)
tree      c386e760532b3754d28f14999288fb051824a5b9 /docs/mllib-optimization.md
parent    f6e7302cb49ee227aed537026d19f68528051dfd (diff)
[SPARK-1945][MLLIB] Documentation Improvements for Spark 1.0
Standalone application examples written in Java are added to the 'mllib-linear-methods.md' file. This commit is related to the issue [Add full Java Examples in MLlib docs](https://issues.apache.org/jira/browse/SPARK-1945). I also changed the name of the sigmoid function from 'logit' to 'f', since the logit function is the inverse of the sigmoid. Thanks, Michael

Author: Michael Giannakopoulos <miccagiann@gmail.com>

Closes #1311 from miccagiann/master and squashes the following commits:

8ffe5ab [Michael Giannakopoulos] Update code so as to comply with code standards.
f7ad5cc [Michael Giannakopoulos] Merge remote-tracking branch 'upstream/master'
38d92c7 [Michael Giannakopoulos] Adding PCA, SVD and LBFGS examples in Java. Performing minor updates in the already committed examples so as to eliminate calls to the 'productElement' function wherever possible.
cc0a089 [Michael Giannakopoulos] Modified Java examples so as to comply with coding standards.
b1141b2 [Michael Giannakopoulos] Added Java examples for Clustering and Collaborative Filtering [mllib-clustering.md & mllib-collaborative-filtering.md].
837f7a8 [Michael Giannakopoulos] Merge remote-tracking branch 'upstream/master'
15f0eb4 [Michael Giannakopoulos] Java examples included in 'mllib-linear-methods.md' file.
Diffstat (limited to 'docs/mllib-optimization.md')
-rw-r--r--  docs/mllib-optimization.md | 96
1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/docs/mllib-optimization.md b/docs/mllib-optimization.md
index 651958c781..26ce5f3c50 100644
--- a/docs/mllib-optimization.md
+++ b/docs/mllib-optimization.md
@@ -207,6 +207,10 @@ the loss computed for every iteration.
Here is an example of training binary logistic regression with L2 regularization using the
L-BFGS optimizer.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
@@ -263,7 +267,97 @@ println("Loss of each step in training process")
loss.foreach(println)
println("Area under ROC = " + auROC)
{% endhighlight %}
-
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import java.util.Arrays;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.classification.LogisticRegressionModel;
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.optimization.*;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+
+public class LBFGSExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("L-BFGS Example");
+    SparkContext sc = new SparkContext(conf);
+    String path = "data/mllib/sample_libsvm_data.txt";
+    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
+    int numFeatures = data.take(1).get(0).features().size();
+
+    // Split initial RDD into two... [60% training data, 40% testing data].
+    JavaRDD<LabeledPoint> trainingInit = data.sample(false, 0.6, 11L);
+    JavaRDD<LabeledPoint> test = data.subtract(trainingInit);
+
+    // Append a constant 1 to each training vector to act as the intercept term.
+    JavaRDD<Tuple2<Object, Vector>> training = trainingInit.map(
+      new Function<LabeledPoint, Tuple2<Object, Vector>>() {
+        public Tuple2<Object, Vector> call(LabeledPoint p) {
+          return new Tuple2<Object, Vector>(p.label(), MLUtils.appendBias(p.features()));
+        }
+      });
+    training.cache();
+
+    // Run training algorithm to build the model.
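+    // numCorrections sets how many past updates the L-BFGS approximation keeps
+    // (3 to 10 is recommended); convergenceTol and maxNumIterations bound the
+    // run, and regParam controls the L2 regularization strength.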
+    int numCorrections = 10;
+    double convergenceTol = 1e-4;
+    int maxNumIterations = 20;
+    double regParam = 0.1;
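+    // Initial weights are all zeros; the extra component is the intercept slot.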
+    Vector initialWeightsWithIntercept = Vectors.dense(new double[numFeatures + 1]);
+
+    Tuple2<Vector, double[]> result = LBFGS.runLBFGS(
+      training.rdd(),
+      new LogisticGradient(),
+      new SquaredL2Updater(),
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      initialWeightsWithIntercept);
+    Vector weightsWithIntercept = result._1();
+    double[] loss = result._2();
+
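+    // The last element of weightsWithIntercept is the appended intercept;
+    // split it off to construct the final model.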
+    final LogisticRegressionModel model = new LogisticRegressionModel(
+      Vectors.dense(Arrays.copyOf(weightsWithIntercept.toArray(), weightsWithIntercept.size() - 1)),
+      (weightsWithIntercept.toArray())[weightsWithIntercept.size() - 1]);
+
+    // Clear the default threshold so predict() returns raw scores instead of 0/1 labels.
+    model.clearThreshold();
+
+    // Compute raw scores on the test set.
+    JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
+      new Function<LabeledPoint, Tuple2<Object, Object>>() {
+        public Tuple2<Object, Object> call(LabeledPoint p) {
+          Double score = model.predict(p.features());
+          return new Tuple2<Object, Object>(score, p.label());
+        }
+      });
+
+    // Get evaluation metrics.
+    BinaryClassificationMetrics metrics =
+      new BinaryClassificationMetrics(scoreAndLabels.rdd());
+    double auROC = metrics.areaUnderROC();
+
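+    // `loss` records the objective value at each L-BFGS iteration and should
+    // decrease as the optimizer converges.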
+ System.out.println("Loss of each step in training process");
+ for (double l : loss)
+ System.out.println(l);
+ System.out.println("Area under ROC = " + auROC);
+ }
+}
+{% endhighlight %}
+</div>
+</div>
#### Developer's note
Since the Hessian is constructed approximately from previous gradient evaluations,
the objective function cannot be changed during the optimization process.
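
For example, to compare two regularization strengths, launch a separate L-BFGS run per
setting rather than adjusting `regParam` mid-optimization. Below is a minimal sketch of
this pattern (an illustration, not part of the example above), reusing `training`,
`numCorrections`, `convergenceTol`, `maxNumIterations`, and `initialWeightsWithIntercept`
from the Scala example and warm-starting the second run from the first solution:

{% highlight scala %}
// Two independent runs: each run sees a single, fixed objective.
val (weights1, loss1) = LBFGS.runLBFGS(
  training, new LogisticGradient(), new SquaredL2Updater(),
  numCorrections, convergenceTol, maxNumIterations, 0.1,
  initialWeightsWithIntercept)

// Changing regParam changes the objective, so it requires a fresh run;
// warm-start from the previous solution instead of mutating a running optimizer.
val (weights2, loss2) = LBFGS.runLBFGS(
  training, new LogisticGradient(), new SquaredL2Updater(),
  numCorrections, convergenceTol, maxNumIterations, 0.01,
  weights1)
{% endhighlight %}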