aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2015-07-17 14:08:06 -0700
committerJoseph K. Bradley <joseph@databricks.com>2015-07-17 14:08:06 -0700
commit830666f6fe1e77faa39eed2c1c3cd8e83bc93ef9 (patch)
tree7220ca02ff1fb2e6be9e3bed133b25ce23cb6f93 /python
parent6da1069696186572c66cbd83947c1a1dbd2bc827 (diff)
downloadspark-830666f6fe1e77faa39eed2c1c3cd8e83bc93ef9.tar.gz
spark-830666f6fe1e77faa39eed2c1c3cd8e83bc93ef9.tar.bz2
spark-830666f6fe1e77faa39eed2c1c3cd8e83bc93ef9.zip
[SPARK-8792] [ML] Add Python API for PCA transformer
Add Python API for PCA transformer Author: Yanbo Liang <ybliang8@gmail.com> Closes #7190 from yanboliang/spark-8792 and squashes the following commits: 8f4ac31 [Yanbo Liang] address comments 8a79cc0 [Yanbo Liang] Add Python API for PCA transformer
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/ml/feature.py64
1 file changed, 63 insertions, 1 deletion
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 9bca7cc000..86e654dd07 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -24,7 +24,7 @@ from pyspark.mllib.common import inherit_doc
__all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
- 'Word2Vec', 'Word2VecModel']
+ 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel']
@inherit_doc
@@ -1048,6 +1048,68 @@ class Word2VecModel(JavaModel):
"""
+@inherit_doc
+class PCA(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    PCA trains a model to project vectors to a low-dimensional space using PCA.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+    ...     (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+    ...     (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+    >>> df = sqlContext.createDataFrame(data,["features"])
+    >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features")
+    >>> model = pca.fit(df)
+    >>> model.transform(df).collect()[0].pca_features
+    DenseVector([1.648..., -4.013...])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    k = Param(Params._dummy(), "k", "the number of principal components")
+
+    @keyword_only
+    def __init__(self, k=None, inputCol=None, outputCol=None):
+        """
+        __init__(self, k=None, inputCol=None, outputCol=None)
+        """
+        super(PCA, self).__init__()
+        # Create the companion JVM estimator that performs the actual fitting.
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid)
+        # Instance-level Param bound to self; shadows the class-level placeholder above.
+        self.k = Param(self, "k", "the number of principal components")
+        # _input_kwargs is attached to this function by the @keyword_only decorator.
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, k=None, inputCol=None, outputCol=None):
+        """
+        setParams(self, k=None, inputCol=None, outputCol=None)
+        Set params for this PCA.
+        """
+        # Only the keyword arguments the caller actually passed are applied.
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setK(self, value):
+        """
+        Sets the value of :py:attr:`k`.
+        """
+        # Write directly into the param map; return self to allow call chaining.
+        self._paramMap[self.k] = value
+        return self
+
+    def getK(self):
+        """
+        Gets the value of k or its default value.
+        """
+        return self.getOrDefault(self.k)
+
+    def _create_model(self, java_model):
+        # Wrap the fitted JVM model in its Python companion class.
+        return PCAModel(java_model)
+
+class PCAModel(JavaModel):
+    """
+    Model fitted by PCA.
+
+    Use :py:meth:`transform` to project input vectors onto the principal
+    components (see the doctest on :py:class:`PCA`).
+    """
+
+
if __name__ == "__main__":
import doctest
from pyspark.context import SparkContext