diff options
author | Yanbo Liang <ybliang8@gmail.com> | 2015-06-21 12:04:20 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2015-06-21 12:04:20 -0700 |
commit | 32e3cdaa647722671adcb5068bd5ffbf2f157806 (patch) | |
tree | 9083af7080e560cb1224813a285a69aa4b19becf /python/pyspark | |
parent | a1e3649c8775d71ca78796b6544284e942ac1331 (diff) | |
download | spark-32e3cdaa647722671adcb5068bd5ffbf2f157806.tar.gz spark-32e3cdaa647722671adcb5068bd5ffbf2f157806.tar.bz2 spark-32e3cdaa647722671adcb5068bd5ffbf2f157806.zip |
[SPARK-7604] [MLLIB] Python API for PCA and PCAModel
Python API for PCA and PCAModel
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #6315 from yanboliang/spark-7604 and squashes the following commits:
1d58734 [Yanbo Liang] remove transform() in PCAModel, use default behavior
4d9d121 [Yanbo Liang] Python API for PCA and PCAModel
Diffstat (limited to 'python/pyspark')
-rw-r--r-- | python/pyspark/mllib/feature.py | 35 |
1 file changed, 35 insertions, 0 deletions
# Hunk from python/pyspark/mllib/feature.py (index cf5fdf2cf9..334f5b86cd,
# @@ -252,6 +252,41 @@ class ChiSqSelector(object):).
# The two classes below are inserted right after the line
# `return ChiSqSelectorModel(jmodel)` and right before
# `class HashingTF(object):`.


class PCAModel(JavaVectorTransformer):
    """
    Model fitted by :class:`PCA` that can project vectors to a
    low-dimensional space using PCA.

    The class body defines no methods of its own, so ``transform`` (used in
    the :class:`PCA` example) is whatever `JavaVectorTransformer` provides.
    """


class PCA(object):
    """
    A feature transformer that projects vectors to a low-dimensional space using PCA.

    >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
    ...     Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
    ...     Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
    >>> model = PCA(2).fit(sc.parallelize(data))
    >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
    >>> pcArray[0]
    1.648...
    >>> pcArray[1]
    -4.013...
    """
    def __init__(self, k):
        """
        :param k: number of principal components. The value is converted
                  with ``int()`` and must be at least 1.
        :raises ValueError: if ``k`` is smaller than 1.
        """
        k = int(k)
        # Reject a meaningless component count here instead of letting it
        # surface later from the backend call made in fit().
        if k < 1:
            raise ValueError("PCA requires a number of principal components "
                             "k >= 1 but was given %d" % k)
        self.k = k

    def fit(self, data):
        """
        Computes a :class:`PCAModel` that contains the principal components
        of the input vectors.

        :param data: source vectors (the class example passes an RDD built
                     with ``sc.parallelize``).
        :return: a :class:`PCAModel` wrapping the object returned by the
                 ``"fitPCA"`` backend call.
        """
        jmodel = callMLlibFunc("fitPCA", self.k, data)
        return PCAModel(jmodel)