From db56f2df1b8027171da1b8d2571d1f2ef1e103b6 Mon Sep 17 00:00:00 2001 From: Michael Giannakopoulos Date: Sun, 20 Jul 2014 20:48:44 -0700 Subject: [SPARK-1945][MLLIB] Documentation Improvements for Spark 1.0 Standalone application examples are added to 'mllib-linear-methods.md' file written in Java. This commit is related to the issue [Add full Java Examples in MLlib docs](https://issues.apache.org/jira/browse/SPARK-1945). Also I changed the name of the sigmoid function from 'logit' to 'f'. This is because the logit function is the inverse of sigmoid. Thanks, Michael Author: Michael Giannakopoulos Closes #1311 from miccagiann/master and squashes the following commits: 8ffe5ab [Michael Giannakopoulos] Update code so as to comply with code standards. f7ad5cc [Michael Giannakopoulos] Merge remote-tracking branch 'upstream/master' 38d92c7 [Michael Giannakopoulos] Adding PCA, SVD and LBFGS examples in Java. Performing minor updates in the already committed examples so as to eradicate the call of 'productElement' function whenever is possible. cc0a089 [Michael Giannakopoulos] Modyfied Java examples so as to comply with coding standards. b1141b2 [Michael Giannakopoulos] Added Java examples for Clustering and Collaborative Filtering [mllib-clustering.md & mllib-collaborative-filtering.md]. 837f7a8 [Michael Giannakopoulos] Merge remote-tracking branch 'upstream/master' 15f0eb4 [Michael Giannakopoulos] Java examples included in 'mllib-linear-methods.md' file. --- docs/mllib-dimensionality-reduction.md | 94 ++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'docs/mllib-dimensionality-reduction.md') diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md index e3608075fb..8e434998c1 100644 --- a/docs/mllib-dimensionality-reduction.md +++ b/docs/mllib-dimensionality-reduction.md @@ -57,10 +57,57 @@ val U: RowMatrix = svd.U // The U factor is a RowMatrix. 
val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. {% endhighlight %} + +Same code applies to `IndexedRowMatrix`. +The only difference is that the `U` matrix becomes an `IndexedRowMatrix`. +
+In order to run the following standalone application using the Spark framework, make +sure that you follow the instructions provided in the [Standalone +Applications](quick-start.html) section of the quick-start guide. What is more, you +should include *spark-mllib* as a dependency in your build file. + +{% highlight java %} +import java.util.LinkedList; + +import org.apache.spark.api.java.*; +import org.apache.spark.mllib.linalg.distributed.RowMatrix; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.SingularValueDecomposition; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.rdd.RDD; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; + +public class SVD { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("SVD Example"); + SparkContext sc = new SparkContext(conf); + + double[][] array = ... + LinkedList<Vector> rowsList = new LinkedList<Vector>(); + for (int i = 0; i < array.length; i++) { + Vector currentRow = Vectors.dense(array[i]); + rowsList.add(currentRow); + } + JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList); + + // Create a RowMatrix from JavaRDD. + RowMatrix mat = new RowMatrix(rows.rdd()); + + // Compute the top 4 singular values and corresponding singular vectors. + SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(4, true, 1.0E-9d); + RowMatrix U = svd.U(); + Vector s = svd.s(); + Matrix V = svd.V(); + } +} +{% endhighlight %} Same code applies to `IndexedRowMatrix`. The only difference is that the `U` matrix becomes an `IndexedRowMatrix`.
+ ## Principal component analysis (PCA) @@ -91,4 +138,51 @@ val pc: Matrix = mat.computePrincipalComponents(10) // Principal components are val projected: RowMatrix = mat.multiply(pc) {% endhighlight %} + +
+The following code demonstrates how to compute principal components on a tall-and-skinny `RowMatrix` +and use them to project the vectors into a low-dimensional space. +The number of columns should be small, e.g., less than 1000. + +{% highlight java %} +import java.util.LinkedList; + +import org.apache.spark.api.java.*; +import org.apache.spark.mllib.linalg.distributed.RowMatrix; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.rdd.RDD; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkContext; + +public class PCA { + public static void main(String[] args) { + SparkConf conf = new SparkConf().setAppName("PCA Example"); + SparkContext sc = new SparkContext(conf); + + double[][] array = ... + LinkedList<Vector> rowsList = new LinkedList<Vector>(); + for (int i = 0; i < array.length; i++) { + Vector currentRow = Vectors.dense(array[i]); + rowsList.add(currentRow); + } + JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList); + + // Create a RowMatrix from JavaRDD. + RowMatrix mat = new RowMatrix(rows.rdd()); + + // Compute the top 3 principal components. + Matrix pc = mat.computePrincipalComponents(3); + RowMatrix projected = mat.multiply(pc); + } +} +{% endhighlight %} + +In order to run the above standalone application using the Spark framework, make +sure that you follow the instructions provided in the [Standalone +Applications](quick-start.html) section of the quick-start guide. What is more, you +should include *spark-mllib* as a dependency in your build file. +
-- cgit v1.2.3