From 830666f6fe1e77faa39eed2c1c3cd8e83bc93ef9 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 17 Jul 2015 14:08:06 -0700
Subject: [SPARK-8792] [ML] Add Python API for PCA transformer

Add Python API for PCA transformer

Author: Yanbo Liang

Closes #7190 from yanboliang/spark-8792 and squashes the following commits:

8f4ac31 [Yanbo Liang] address comments
8a79cc0 [Yanbo Liang] Add Python API for PCA transformer
---
 python/pyspark/ml/feature.py | 64 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

(limited to 'python/pyspark/ml/feature.py')

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 9bca7cc000..86e654dd07 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -24,7 +24,7 @@ from pyspark.mllib.common import inherit_doc
 __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
            'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
            'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
-           'Word2Vec', 'Word2VecModel']
+           'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel']
 
 
 @inherit_doc
@@ -1048,6 +1048,68 @@ class Word2VecModel(JavaModel):
     """
 
 
+@inherit_doc
+class PCA(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    PCA trains a model to project vectors to a low-dimensional space using PCA.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+    ...     (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+    ...     (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+    >>> df = sqlContext.createDataFrame(data,["features"])
+    >>> pca = PCA(k=2, inputCol="features", outputCol="pca_features")
+    >>> model = pca.fit(df)
+    >>> model.transform(df).collect()[0].pca_features
+    DenseVector([1.648..., -4.013...])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    k = Param(Params._dummy(), "k", "the number of principal components")
+
+    @keyword_only
+    def __init__(self, k=None, inputCol=None, outputCol=None):
+        """
+        __init__(self, k=None, inputCol=None, outputCol=None)
+        """
+        super(PCA, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid)
+        self.k = Param(self, "k", "the number of principal components")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, k=None, inputCol=None, outputCol=None):
+        """
+        setParams(self, k=None, inputCol=None, outputCol=None)
+        Set params for this PCA.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setK(self, value):
+        """
+        Sets the value of :py:attr:`k`.
+        """
+        self._paramMap[self.k] = value
+        return self
+
+    def getK(self):
+        """
+        Gets the value of k or its default value.
+        """
+        return self.getOrDefault(self.k)
+
+    def _create_model(self, java_model):
+        return PCAModel(java_model)
+
+
+class PCAModel(JavaModel):
+    """
+    Model fitted by PCA.
+    """
+
+
 if __name__ == "__main__":
     import doctest
     from pyspark.context import SparkContext
-- 
cgit v1.2.3