[SPARK-13016][DOCUMENTATION] Replace example code in mllib-dimensionality-reduction.md using include_example

Replaced example example code in mllib-dimensionality-reduction.md using include_example Author: Devaraj K <devaraj@apache.org> Closes #11132 from devaraj-kavali/SPARK-13016.
author: Devaraj K <devaraj@apache.org> 2016-02-22 17:16:56 -0800
committer: Xiangrui Meng <meng@databricks.com> 2016-02-22 17:16:56 -0800
commit: 9f410871ca03f4c04bd965b2e4f80167ce543139 (patch)
tree: 8c04aa65938c5dbcea96de42463b625ccc0ef313
parent: 2063781840831469b394313694bfd25cbde2bb1e (diff)
download: spark-9f410871ca03f4c04bd965b2e4f80167ce543139.tar.gz
spark-9f410871ca03f4c04bd965b2e4f80167ce543139.tar.bz2
spark-9f410871ca03f4c04bd965b2e4f80167ce543139.zip
6 files changed, 316 insertions, 108 deletions
diff --git a/docs/mllib-dimensionality-reduction.md b/docs/mllib-dimensionality-reduction.md
index 11d8e0bd1d..cceddce9f7 100644
--- a/docs/mllib-dimensionality-reduction.md
+++ b/docs/mllib-dimensionality-reduction.md
@@ -64,19 +64,7 @@ passes, $O(n)$ storage on each executor, and $O(n k)$ storage on the driver.
 <div data-lang="scala" markdown="1">
 Refer to the [`SingularValueDecomposition` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.SingularValueDecomposition) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Matrix
-import org.apache.spark.mllib.linalg.distributed.RowMatrix
-import org.apache.spark.mllib.linalg.SingularValueDecomposition
-
-val mat: RowMatrix = ...
-
-// Compute the top 20 singular values and corresponding singular vectors.
-val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(20, computeU = true)
-val U: RowMatrix = svd.U // The U factor is a RowMatrix.
-val s: Vector = svd.s // The singular values are stored in a local dense vector.
-val V: Matrix = svd.V // The V factor is a local dense matrix.
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/SVDExample.scala %}
 
 The same code applies to `IndexedRowMatrix` if `U` is defined as an
 `IndexedRowMatrix`.
@@ -84,43 +72,7 @@ The same code applies to `IndexedRowMatrix` if `U` is defined as an
 <div data-lang="java" markdown="1">
 Refer to the [`SingularValueDecomposition` Java docs](api/java/org/apache/spark/mllib/linalg/SingularValueDecomposition.html) for details on the API.
 
-{% highlight java %}
-import java.util.LinkedList;
-
-import org.apache.spark.api.java.*;
-import org.apache.spark.mllib.linalg.distributed.RowMatrix;
-import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.mllib.linalg.SingularValueDecomposition;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.rdd.RDD;
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
-
-public class SVD {
-  public static void main(String[] args) {
-    SparkConf conf = new SparkConf().setAppName("SVD Example");
-    SparkContext sc = new SparkContext(conf);
-     
-    double[][] array = ...
-    LinkedList<Vector> rowsList = new LinkedList<Vector>();
-    for (int i = 0; i < array.length; i++) {
-      Vector currentRow = Vectors.dense(array[i]);
-      rowsList.add(currentRow);
-    }
-    JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);
-
-    // Create a RowMatrix from JavaRDD<Vector>.
-    RowMatrix mat = new RowMatrix(rows.rdd());
-
-    // Compute the top 4 singular values and corresponding singular vectors.
-    SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(4, true, 1.0E-9d);
-    RowMatrix U = svd.U();
-    Vector s = svd.s();
-    Matrix V = svd.V();
-  }
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaSVDExample.java %}
 
 The same code applies to `IndexedRowMatrix` if `U` is defined as an
 `IndexedRowMatrix`.
@@ -151,36 +103,14 @@ and use them to project the vectors into a low-dimensional space.
 
 Refer to the [`RowMatrix` Scala docs](api/scala/index.html#org.apache.spark.mllib.linalg.distributed.RowMatrix) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Matrix
-import org.apache.spark.mllib.linalg.distributed.RowMatrix
-
-val mat: RowMatrix = ...
-
-// Compute the top 10 principal components.
-val pc: Matrix = mat.computePrincipalComponents(10) // Principal components are stored in a local dense matrix.
-
-// Project the rows to the linear space spanned by the top 10 principal components.
-val projected: RowMatrix = mat.multiply(pc)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala %}
 
 The following code demonstrates how to compute principal components on source vectors
 and use them to project the vectors into a low-dimensional space while keeping associated labels:
 
 Refer to the [`PCA` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.PCA) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.feature.PCA
-
-val data: RDD[LabeledPoint] = ...
-
-// Compute the top 10 principal components.
-val pca = new PCA(10).fit(data.map(_.features))
-
-// Project vectors to the linear space spanned by the top 10 principal components, keeping the label
-val projected = data.map(p => p.copy(features = pca.transform(p.features)))
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/PCAOnSourceVectorExample.scala %}
 
 </div>
 
@@ -192,40 +122,7 @@ The number of columns should be small, e.g, less than 1000.
 
 Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/distributed/RowMatrix.html) for details on the API.
 
-{% highlight java %}
-import java.util.LinkedList;
-
-import org.apache.spark.api.java.*;
-import org.apache.spark.mllib.linalg.distributed.RowMatrix;
-import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.rdd.RDD;
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
-
-public class PCA {
-  public static void main(String[] args) {
-    SparkConf conf = new SparkConf().setAppName("PCA Example");
-    SparkContext sc = new SparkContext(conf);
-     
-    double[][] array = ...
-    LinkedList<Vector> rowsList = new LinkedList<Vector>();
-    for (int i = 0; i < array.length; i++) {
-      Vector currentRow = Vectors.dense(array[i]);
-      rowsList.add(currentRow);
-    }
-    JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);
-
-    // Create a RowMatrix from JavaRDD<Vector>.
-    RowMatrix mat = new RowMatrix(rows.rdd());
-
-    // Compute the top 3 principal components.
-    Matrix pc = mat.computePrincipalComponents(3);
-    RowMatrix projected = mat.multiply(pc);
-  }
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaPCAExample.java %}
 
 </div>
 </div>
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java
new file mode 100644
index 0000000000..faf76a9540
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaPCAExample.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+// $example on$
+import java.util.LinkedList;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.linalg.distributed.RowMatrix;
+// $example off$
+
+/**
+ * Example for compute principal components on a 'RowMatrix'.
+ */
+public class JavaPCAExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("PCA Example");
+    SparkContext sc = new SparkContext(conf);
+
+    // $example on$
+    double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
+    LinkedList<Vector> rowsList = new LinkedList<Vector>();
+    for (int i = 0; i < array.length; i++) {
+      Vector currentRow = Vectors.dense(array[i]);
+      rowsList.add(currentRow);
+    }
+    JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);
+
+    // Create a RowMatrix from JavaRDD<Vector>.
+    RowMatrix mat = new RowMatrix(rows.rdd());
+
+    // Compute the top 3 principal components.
+    Matrix pc = mat.computePrincipalComponents(3);
+    RowMatrix projected = mat.multiply(pc);
+    // $example off$
+    Vector[] collectPartitions = (Vector[])projected.rows().collect();
+    System.out.println("Projected vector of principal component:");
+    for (Vector vector : collectPartitions) {
+      System.out.println("\t" + vector);
+    }
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java
new file mode 100644
index 0000000000..f3685db9f2
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaSVDExample.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+// $example on$
+import java.util.LinkedList;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.mllib.linalg.Matrix;
+import org.apache.spark.mllib.linalg.SingularValueDecomposition;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.linalg.distributed.RowMatrix;
+// $example off$
+
+/**
+ * Example for SingularValueDecomposition.
+ */
+public class JavaSVDExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf().setAppName("SVD Example");
+    SparkContext sc = new SparkContext(conf);
+
+    // $example on$
+    double[][] array = {{1.12, 2.05, 3.12}, {5.56, 6.28, 8.94}, {10.2, 8.0, 20.5}};
+    LinkedList<Vector> rowsList = new LinkedList<Vector>();
+    for (int i = 0; i < array.length; i++) {
+      Vector currentRow = Vectors.dense(array[i]);
+      rowsList.add(currentRow);
+    }
+    JavaRDD<Vector> rows = JavaSparkContext.fromSparkContext(sc).parallelize(rowsList);
+
+    // Create a RowMatrix from JavaRDD<Vector>.
+    RowMatrix mat = new RowMatrix(rows.rdd());
+
+    // Compute the top 3 singular values and corresponding singular vectors.
+    SingularValueDecomposition<RowMatrix, Matrix> svd = mat.computeSVD(3, true, 1.0E-9d);
+    RowMatrix U = svd.U();
+    Vector s = svd.s();
+    Matrix V = svd.V();
+    // $example off$
+    Vector[] collectPartitions = (Vector[]) U.rows().collect();
+    System.out.println("U factor is:");
+    for (Vector vector : collectPartitions) {
+      System.out.println("\t" + vector);
+    }
+    System.out.println("Singular values are: " + s);
+    System.out.println("V factor is:\n" + V);
+  }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala
new file mode 100644
index 0000000000..234de230eb
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnRowMatrixExample.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.linalg.Matrix
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.distributed.RowMatrix
+// $example off$
+
+object PCAOnRowMatrixExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("PCAOnRowMatrixExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val data = Array(
+      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+
+    val dataRDD = sc.parallelize(data, 2)
+
+    val mat: RowMatrix = new RowMatrix(dataRDD)
+
+    // Compute the top 4 principal components.
+    // Principal components are stored in a local dense matrix.
+    val pc: Matrix = mat.computePrincipalComponents(4)
+
+    // Project the rows to the linear space spanned by the top 4 principal components.
+    val projected: RowMatrix = mat.multiply(pc)
+    // $example off$
+    val collect = projected.rows.collect()
+    println("Projected Row Matrix of principal component:")
+    collect.foreach { vector => println(vector) }
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnSourceVectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnSourceVectorExample.scala
new file mode 100644
index 0000000000..f7694879df
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAOnSourceVectorExample.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object PCAOnSourceVectorExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("PCAOnSourceVectorExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val data: RDD[LabeledPoint] = sc.parallelize(Seq(
+      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)),
+      new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)),
+      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)),
+      new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)),
+      new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0))))
+
+    // Compute the top 5 principal components.
+    val pca = new PCA(5).fit(data.map(_.features))
+
+    // Project vectors to the linear space spanned by the top 5 principal
+    // components, keeping the label
+    val projected = data.map(p => p.copy(features = pca.transform(p.features)))
+    // $example off$
+    val collect = projected.collect()
+    println("Projected vector of principal component:")
+    collect.foreach { vector => println(vector) }
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala
new file mode 100644
index 0000000000..c26580d4c1
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SVDExample.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.linalg.Matrix
+import org.apache.spark.mllib.linalg.SingularValueDecomposition
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.distributed.RowMatrix
+// $example off$
+
+object SVDExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("SVDExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val data = Array(
+      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+
+    val dataRDD = sc.parallelize(data, 2)
+
+    val mat: RowMatrix = new RowMatrix(dataRDD)
+
+    // Compute the top 5 singular values and corresponding singular vectors.
+    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true)
+    val U: RowMatrix = svd.U  // The U factor is a RowMatrix.
+    val s: Vector = svd.s  // The singular values are stored in a local dense vector.
+    val V: Matrix = svd.V  // The V factor is a local dense matrix.
+    // $example off$
+    val collect = U.rows.collect()
+    println("U factor is:")
+    collect.foreach { vector => println(vector) }
+    println(s"Singular values are: $s")
+    println(s"V factor is:\n$V")
+  }
+}
+// scalastyle:on println
author	Devaraj K <devaraj@apache.org>	2016-02-22 17:16:56 -0800
committer	Xiangrui Meng <meng@databricks.com>	2016-02-22 17:16:56 -0800
commit	9f410871ca03f4c04bd965b2e4f80167ce543139 (patch)
tree	8c04aa65938c5dbcea96de42463b625ccc0ef313
parent	2063781840831469b394313694bfd25cbde2bb1e (diff)
download	spark-9f410871ca03f4c04bd965b2e4f80167ce543139.tar.gz spark-9f410871ca03f4c04bd965b2e4f80167ce543139.tar.bz2 spark-9f410871ca03f4c04bd965b2e4f80167ce543139.zip