author | Michael Giannakopoulos <miccagiann@gmail.com> | 2014-07-20 20:48:44 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2014-07-20 20:48:44 -0700 |
commit | db56f2df1b8027171da1b8d2571d1f2ef1e103b6 (patch) | |
tree | c386e760532b3754d28f14999288fb051824a5b9 /docs/mllib-optimization.md | |
parent | f6e7302cb49ee227aed537026d19f68528051dfd (diff) | |
[SPARK-1945][MLLIB] Documentation Improvements for Spark 1.0
Standalone application examples written in Java are added to the 'mllib-linear-methods.md' file.
This commit is related to the issue [Add full Java Examples in MLlib docs](https://issues.apache.org/jira/browse/SPARK-1945).
I also renamed the sigmoid function from 'logit' to 'f', because the logit function
is the inverse of the sigmoid.
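(For reference only, not part of the commit: the relationship that motivates the rename can be sketched in a few lines of Java, assuming the usual definitions of the two functions. The class and method names are illustrative.)

```java
public class SigmoidVsLogit {  // illustrative name, not from the Spark docs
  // The sigmoid maps a real-valued score to a probability in (0, 1).
  static double sigmoid(double z) { return 1.0 / (1.0 + Math.exp(-z)); }

  // The logit maps a probability back to the real line; it is the inverse of
  // the sigmoid, which is why calling the sigmoid itself 'logit' was misleading.
  static double logit(double p) { return Math.log(p / (1.0 - p)); }

  public static void main(String[] args) {
    double z = 1.7;
    System.out.println(sigmoid(z));         // ~0.8455
    System.out.println(logit(sigmoid(z)));  // ~1.7, recovering z
  }
}
```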
Thanks,
Michael
Author: Michael Giannakopoulos <miccagiann@gmail.com>
Closes #1311 from miccagiann/master and squashes the following commits:
8ffe5ab [Michael Giannakopoulos] Update code so as to comply with code standards.
f7ad5cc [Michael Giannakopoulos] Merge remote-tracking branch 'upstream/master'
38d92c7 [Michael Giannakopoulos] Adding PCA, SVD and LBFGS examples in Java. Performing minor updates to the already committed examples so as to eliminate calls to the 'productElement' function wherever possible (see the Tuple2 sketch after this list).
cc0a089 [Michael Giannakopoulos] Modified Java examples so as to comply with coding standards.
b1141b2 [Michael Giannakopoulos] Added Java examples for Clustering and Collaborative Filtering [mllib-clustering.md & mllib-collaborative-filtering.md].
837f7a8 [Michael Giannakopoulos] Merge remote-tracking branch 'upstream/master'
15f0eb4 [Michael Giannakopoulos] Java examples included in 'mllib-linear-methods.md' file.
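The 38d92c7 entry above refers to replacing 'productElement' calls with the typed Tuple2 accessors. A minimal illustration of the difference, not taken from the diff (the class name is arbitrary):

```java
import scala.Tuple2;

public class TupleAccess {  // illustrative name only
  public static void main(String[] args) {
    Tuple2<Double, String> t = new Tuple2<Double, String>(0.85, "score");

    // productElement comes from scala.Product and returns an untyped Object,
    // so callers have to cast.
    Double viaProduct = (Double) t.productElement(0);

    // _1() and _2() are the typed accessors, which read more naturally from Java.
    Double first = t._1();
    String second = t._2();

    System.out.println(viaProduct + " " + first + " " + second);
  }
}
```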
Diffstat (limited to 'docs/mllib-optimization.md')
-rw-r--r-- | docs/mllib-optimization.md | 96 |
1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/docs/mllib-optimization.md b/docs/mllib-optimization.md
index 651958c781..26ce5f3c50 100644
--- a/docs/mllib-optimization.md
+++ b/docs/mllib-optimization.md
@@ -207,6 +207,10 @@ the loss computed for every iteration.
 
 Here is an example to train binary logistic regression with L2 regularization using
 L-BFGS optimizer.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
 {% highlight scala %}
 import org.apache.spark.SparkContext
 import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
@@ -263,7 +267,97 @@ println("Loss of each step in training process")
 loss.foreach(println)
 println("Area under ROC = " + auROC)
 {% endhighlight %}
-
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import java.util.Arrays;
+import java.util.Random;
+
+import scala.Tuple2;
+
+import org.apache.spark.api.java.*;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.classification.LogisticRegressionModel;
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.optimization.*;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+
+public class LBFGSExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("L-BFGS Example");
+    SparkContext sc = new SparkContext(conf);
+    String path = "data/mllib/sample_libsvm_data.txt";
+    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();
+    int numFeatures = data.take(1).get(0).features().size();
+
+    // Split initial RDD into two... [60% training data, 40% testing data].
+    JavaRDD<LabeledPoint> trainingInit = data.sample(false, 0.6, 11L);
+    JavaRDD<LabeledPoint> test = data.subtract(trainingInit);
+
+    // Append 1 into the training data as intercept.
+    JavaRDD<Tuple2<Object, Vector>> training = data.map(
+      new Function<LabeledPoint, Tuple2<Object, Vector>>() {
+        public Tuple2<Object, Vector> call(LabeledPoint p) {
+          return new Tuple2<Object, Vector>(p.label(), MLUtils.appendBias(p.features()));
+        }
+      });
+    training.cache();
+
+    // Run training algorithm to build the model.
+    int numCorrections = 10;
+    double convergenceTol = 1e-4;
+    int maxNumIterations = 20;
+    double regParam = 0.1;
+    Vector initialWeightsWithIntercept = Vectors.dense(new double[numFeatures + 1]);
+
+    Tuple2<Vector, double[]> result = LBFGS.runLBFGS(
+      training.rdd(),
+      new LogisticGradient(),
+      new SquaredL2Updater(),
+      numCorrections,
+      convergenceTol,
+      maxNumIterations,
+      regParam,
+      initialWeightsWithIntercept);
+    Vector weightsWithIntercept = result._1();
+    double[] loss = result._2();
+
+    final LogisticRegressionModel model = new LogisticRegressionModel(
+      Vectors.dense(Arrays.copyOf(weightsWithIntercept.toArray(), weightsWithIntercept.size() - 1)),
+      (weightsWithIntercept.toArray())[weightsWithIntercept.size() - 1]);
+
+    // Clear the default threshold.
+    model.clearThreshold();
+
+    // Compute raw scores on the test set.
+    JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
+      new Function<LabeledPoint, Tuple2<Object, Object>>() {
+        public Tuple2<Object, Object> call(LabeledPoint p) {
+          Double score = model.predict(p.features());
+          return new Tuple2<Object, Object>(score, p.label());
+        }
+      });
+
+    // Get evaluation metrics.
+    BinaryClassificationMetrics metrics =
+      new BinaryClassificationMetrics(scoreAndLabels.rdd());
+    double auROC = metrics.areaUnderROC();
+
+    System.out.println("Loss of each step in training process");
+    for (double l : loss)
+      System.out.println(l);
+    System.out.println("Area under ROC = " + auROC);
+  }
+}
+{% endhighlight %}
+</div>
+</div>
 
 #### Developer's note
 Since the Hessian is constructed approximately from previous gradient
 evaluations, the objective function can not be changed during the optimization
 process.
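As background for the new Java example above: MLUtils.appendBias appends a trailing 1.0 to a feature vector, which is why the example optimizes over numFeatures + 1 weights and later splits the last weight off as the intercept. A standalone sketch of that behavior (not part of the commit; the class name is illustrative, and no SparkContext is needed):

```java
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.util.MLUtils;

public class AppendBiasDemo {  // illustrative name, not from the Spark docs
  public static void main(String[] args) {
    Vector features = Vectors.dense(0.5, -1.2, 3.0);

    // appendBias adds a constant 1.0 at the end, so the last weight learned
    // by L-BFGS acts as the intercept term.
    Vector withBias = MLUtils.appendBias(features);

    System.out.println(withBias);  // [0.5,-1.2,3.0,1.0]
  }
}
```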