1 files changed, 88 insertions, 4 deletions
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 6b713aa393..f6b97abb17 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -15,19 +15,22 @@
 # limitations under the License.
 #
 
+from numpy import array
+
+from pyspark import RDD
 from pyspark import SparkContext
 from pyspark.mllib.common import callMLlibFunc, callJavaFunc
-from pyspark.mllib.linalg import SparseVector, _convert_to_vector
+from pyspark.mllib.linalg import DenseVector, SparseVector, _convert_to_vector
+from pyspark.mllib.stat.distribution import MultivariateGaussian
 
-__all__ = ['KMeansModel', 'KMeans']
+__all__ = ['KMeansModel', 'KMeans', 'GaussianMixtureModel', 'GaussianMixture']
 
 
 class KMeansModel(object):
 
     """A clustering model derived from the k-means method.
 
-    >>> from numpy import array
-    >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4,2)
+    >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
     >>> model = KMeans.train(
     ...     sc.parallelize(data), 2, maxIterations=10, runs=30, initializationMode="random")
     >>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
@@ -86,6 +89,87 @@ class KMeans(object):
         return KMeansModel([c.toArray() for c in centers])
 
 
+class GaussianMixtureModel(object):
+
+    """A clustering model derived from the Gaussian Mixture Model method.
+
+    >>> clusterdata_1 =  sc.parallelize(array([-0.1,-0.05,-0.01,-0.1,
+    ...                                         0.9,0.8,0.75,0.935,
+    ...                                        -0.83,-0.68,-0.91,-0.76 ]).reshape(6, 2))
+    >>> model = GaussianMixture.train(clusterdata_1, 3, convergenceTol=0.0001,
+    ...                                 maxIterations=50, seed=10)
+    >>> labels = model.predict(clusterdata_1).collect()
+    >>> labels[0]==labels[1]
+    False
+    >>> labels[1]==labels[2]
+    True
+    >>> labels[4]==labels[5]
+    True
+    >>> clusterdata_2 =  sc.parallelize(array([-5.1971, -2.5359, -3.8220,
+    ...                                        -5.2211, -5.0602,  4.7118,
+    ...                                         6.8989, 3.4592,  4.6322,
+    ...                                         5.7048,  4.6567, 5.5026,
+    ...                                         4.5605,  5.2043,  6.2734]).reshape(5, 3))
+    >>> model = GaussianMixture.train(clusterdata_2, 2, convergenceTol=0.0001,
+    ...                                 maxIterations=150, seed=10)
+    >>> labels = model.predict(clusterdata_2).collect()
+    >>> labels[0]==labels[1]==labels[2]
+    True
+    >>> labels[3]==labels[4]
+    True
+    """
+
+    def __init__(self, weights, gaussians):
+        self.weights = weights      # one weight per mixture component
+        self.gaussians = gaussians  # per-component objects exposing .mu and .sigma
+        self.k = len(self.weights)  # number of mixture components
+
+    def predict(self, x):
+        """Find the cluster in which each point of 'x' has maximum membership
+        in this model, i.e. the index of its largest predictSoft value.
+
+        :param x:    RDD of data points.
+        :return:     cluster_labels. RDD of cluster labels.
+        :raises TypeError: if 'x' is not an RDD.
+        """
+        if not isinstance(x, RDD):
+            raise TypeError("x should be represented by an RDD, but got %s." % type(x))
+        return self.predictSoft(x).map(lambda z: z.index(max(z)))
+
+    def predictSoft(self, x):
+        """Find the membership of each point in 'x' to all mixture components.
+
+        :param x:    RDD of data points.
+        :return:     membership_matrix. RDD of array of double values.
+        :raises TypeError: if 'x' is not an RDD.
+        """
+        if not isinstance(x, RDD):
+            raise TypeError("x should be represented by an RDD, but got %s." % type(x))
+        means, sigmas = zip(*[(g.mu, g.sigma) for g in self.gaussians])
+        return callMLlibFunc("predictSoftGMM", x.map(_convert_to_vector),
+                             self.weights, means, sigmas)
+
+
+class GaussianMixture(object):
+    """
+    Fit a Gaussian mixture with the expectation-maximization algorithm.
+
+    :param rdd:             RDD of data points
+    :param k:               Number of components
+    :param convergenceTol:  Threshold used for the convergence check. Defaults to 1e-3
+    :param maxIterations:   Number of iterations. Defaults to 100
+    :param seed:            Random seed. Defaults to None
+    """
+    @classmethod
+    def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None):
+        """Train a Gaussian Mixture clustering model and return it."""
+        vectors = rdd.map(_convert_to_vector)
+        weights, means, sigmas = callMLlibFunc("trainGaussianMixture", vectors, k,
+                                               convergenceTol, maxIterations, seed)
+        components = [MultivariateGaussian(means[j], sigmas[j]) for j in range(k)]
+        return GaussianMixtureModel(weights, components)
+
+
 def _test():
     import doctest
     globs = globals().copy()