about | summary | refs | log | tree | commit | diff
path: root/python/pyspark
diff options
context:
space:
mode:
author Yanbo Liang <ybliang8@gmail.com> 2015-06-21 12:04:20 -0700
committer Joseph K. Bradley <joseph@databricks.com> 2015-06-21 12:04:20 -0700
commit 32e3cdaa647722671adcb5068bd5ffbf2f157806 (patch)
tree 9083af7080e560cb1224813a285a69aa4b19becf /python/pyspark
parent a1e3649c8775d71ca78796b6544284e942ac1331 (diff)
download spark-32e3cdaa647722671adcb5068bd5ffbf2f157806.tar.gz
spark-32e3cdaa647722671adcb5068bd5ffbf2f157806.tar.bz2
spark-32e3cdaa647722671adcb5068bd5ffbf2f157806.zip
[SPARK-7604] [MLLIB] Python API for PCA and PCAModel
Python API for PCA and PCAModel

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6315 from yanboliang/spark-7604 and squashes the following commits:

1d58734 [Yanbo Liang] remove transform() in PCAModel, use default behavior
4d9d121 [Yanbo Liang] Python API for PCA and PCAModel
Diffstat (limited to 'python/pyspark')
-rw-r--r-- python/pyspark/mllib/feature.py 35
1 files changed, 35 insertions, 0 deletions
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index cf5fdf2cf9..334f5b86cd 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -252,6 +252,41 @@ class ChiSqSelector(object):
return ChiSqSelectorModel(jmodel)
+class PCAModel(JavaVectorTransformer):
+ """
+ Model fitted by :class:`PCA` that can project vectors to a low-dimensional space using PCA.
+ """
+
+
+class PCA(object):
+ """
+ A feature transformer that projects vectors to a low-dimensional space using PCA.
+
+ >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
+ ... Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
+ ... Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
+ >>> model = PCA(2).fit(sc.parallelize(data))
+ >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
+ >>> pcArray[0]
+ 1.648...
+ >>> pcArray[1]
+ -4.013...
+ """
+ def __init__(self, k):
+ """
+ :param k: number of principal components.
+ """
+ self.k = int(k)
+
+ def fit(self, data):
+ """
+ Computes a :class:`PCAModel` that contains the principal components of the input vectors.
+ :param data: source vectors
+ """
+ jmodel = callMLlibFunc("fitPCA", self.k, data)
+ return PCAModel(jmodel)
+
+
class HashingTF(object):
"""
.. note:: Experimental