author    Yanbo Liang <ybliang8@gmail.com>    2015-08-03 13:58:00 -0700
committer Joseph K. Bradley <joseph@databricks.com>    2015-08-03 13:58:00 -0700
commit    8ca287ebbd58985a568341b08040d0efa9d3641a (patch)
tree      f2c3fc5f5a680483cdbf3589ce134832d1162b24 /docs/ml-features.md
parent    ba1c4e138de2ea84b55def4eed2bd363e60aea4d (diff)
[SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples
Add ml.PCA user guide document and code examples for Scala/Java/Python.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #7522 from yanboliang/ml-pca-md and squashes the following commits:

60dec05 [Yanbo Liang] address comments
f992abe [Yanbo Liang] Add ml.PCA doc and examples
Diffstat (limited to 'docs/ml-features.md')
-rw-r--r--  docs/ml-features.md | 86
1 file changed, 86 insertions(+), 0 deletions(-)
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 54068debe2..fa0ad1f00a 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -461,6 +461,92 @@ for binarized_feature, in binarizedFeatures.collect():
</div>
</div>
+## PCA
+
+[PCA](http://en.wikipedia.org/wiki/Principal_component_analysis) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. The [PCA](api/scala/index.html#org.apache.spark.ml.feature.PCA) class trains a model that projects vectors onto a low-dimensional space. The example below shows how to project 5-dimensional feature vectors onto the first 3 principal components.
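+
+Concretely, `fit` learns a projection matrix `W` (here `5 x 3`) whose columns are the leading principal components, and `transform` maps each feature vector `x` to the lower-dimensional vector `W^T x`.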
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+See the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.feature.PCA) for API details.
+{% highlight scala %}
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Array(
+ Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+ Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+ Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+)
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val pca = new PCA()
+ .setInputCol("features")
+ .setOutputCol("pcaFeatures")
+ .setK(3)
+ .fit(df)
+val pcaDF = pca.transform(df)
+val result = pcaDF.select("pcaFeatures")
+result.show()
+{% endhighlight %}
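+
+For intuition, the fitted model is roughly equivalent to the following lower-level `spark.mllib` computation (a sketch only; it assumes an active `SparkContext` named `sc` and uses `RowMatrix.computePrincipalComponents` instead of the `ml` pipeline API):
+
+{% highlight scala %}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.linalg.distributed.RowMatrix
+
+// The same three 5-dimensional vectors as above, as a distributed row matrix.
+val rows = sc.parallelize(Seq(
+  Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+))
+val mat = new RowMatrix(rows)
+// Compute the top 3 principal components: a 5 x 3 matrix W.
+val pc = mat.computePrincipalComponents(3)
+// Project every row onto the components: each output row is x * W.
+val projected = mat.multiply(pc)
+{% endhighlight %}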
+</div>
+
+<div data-lang="java" markdown="1">
+See the [Java API documentation](api/java/org/apache/spark/ml/feature/PCA.html) for API details.
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.PCA;
+import org.apache.spark.ml.feature.PCAModel;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaSparkContext jsc = ...
+SQLContext jsql = ...
+JavaRDD<Row> data = jsc.parallelize(Lists.newArrayList(
+ RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
+ RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
+ RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+));
+StructType schema = new StructType(new StructField[] {
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+PCAModel pca = new PCA()
+ .setInputCol("features")
+ .setOutputCol("pcaFeatures")
+ .setK(3)
+ .fit(df);
+DataFrame result = pca.transform(df).select("pcaFeatures");
+result.show();
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+See the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.feature.PCA) for API details.
+{% highlight python %}
+from pyspark.ml.feature import PCA
+from pyspark.mllib.linalg import Vectors
+
+data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+ (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+ (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+df = sqlContext.createDataFrame(data, ["features"])
+pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
+model = pca.fit(df)
+result = model.transform(df).select("pcaFeatures")
+result.show(truncate=False)
+{% endhighlight %}
+</div>
+</div>
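+
+In each language, `fit` learns the principal components from the input data and returns a `PCAModel`; its `transform` method can then be applied to this or any other `DataFrame` with a compatible `features` column, producing 3-dimensional `pcaFeatures` vectors.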
+
## PolynomialExpansion
[Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_expansion) is the process of expanding your features into a polynomial space formed by an n-degree combination of the original dimensions. For example, expanding a 2-dimensional vector `(x, y)` with degree 2 yields `(x, x * x, y, x * y, y * y)`. The [PolynomialExpansion](api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion) class provides this functionality. The example below shows how to expand your features into a 3-degree polynomial space.
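
A minimal Scala sketch of this transformer (illustrative only, assuming the same `sqlContext` as in the PCA example above; `PolynomialExpansion` is a `Transformer`, so there is no fitting step):

{% highlight scala %}
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.mllib.linalg.Vectors

// Three illustrative 2-dimensional input vectors.
val data = Array(
  Vectors.dense(-2.0, 2.3),
  Vectors.dense(0.0, 0.0),
  Vectors.dense(0.6, -1.1)
)
val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
// Expand each 2-dimensional vector into the degree-3 polynomial space.
val polyExpansion = new PolynomialExpansion()
  .setInputCol("features")
  .setOutputCol("polyFeatures")
  .setDegree(3)
val polyDF = polyExpansion.transform(df)
polyDF.select("polyFeatures").show()
{% endhighlight %}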