aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/mllib
diff options
context:
space:
mode:
authorHolden Karau <holden@us.ibm.com>2016-01-19 10:15:54 -0800
committerJoseph K. Bradley <joseph@databricks.com>2016-01-19 10:15:54 -0800
commit0ddba6d88ff093a96b4931f71bd0a599afbbca78 (patch)
tree331730923cc4d2d1bcd1b18a0c2829e4e5a6dbd5 /python/pyspark/mllib
parentebd9ce0f1f55f7d2d3bd3b92c4b0a495c51ac6fd (diff)
downloadspark-0ddba6d88ff093a96b4931f71bd0a599afbbca78.tar.gz
spark-0ddba6d88ff093a96b4931f71bd0a599afbbca78.tar.bz2
spark-0ddba6d88ff093a96b4931f71bd0a599afbbca78.zip
[SPARK-11944][PYSPARK][MLLIB] python mllib.clustering.bisecting k means
From the coverage issues for 1.6 : Add Python API for mllib.clustering.BisectingKMeans. Author: Holden Karau <holden@us.ibm.com> Closes #10150 from holdenk/SPARK-11937-python-api-coverage-SPARK-11944-python-mllib.clustering.BisectingKMeans.
Diffstat (limited to 'python/pyspark/mllib')
-rw-r--r--python/pyspark/mllib/clustering.py136
-rw-r--r--python/pyspark/mllib/tests.py11
2 files changed, 142 insertions, 5 deletions
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index 580cb512d8..4e9eb96fd9 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -38,13 +38,130 @@ from pyspark.mllib.stat.distribution import MultivariateGaussian
from pyspark.mllib.util import Saveable, Loader, inherit_doc, JavaLoader, JavaSaveable
from pyspark.streaming import DStream
-__all__ = ['KMeansModel', 'KMeans', 'GaussianMixtureModel', 'GaussianMixture',
- 'PowerIterationClusteringModel', 'PowerIterationClustering',
- 'StreamingKMeans', 'StreamingKMeansModel',
+__all__ = ['BisectingKMeansModel', 'BisectingKMeans', 'KMeansModel', 'KMeans',
+ 'GaussianMixtureModel', 'GaussianMixture', 'PowerIterationClusteringModel',
+ 'PowerIterationClustering', 'StreamingKMeans', 'StreamingKMeansModel',
'LDA', 'LDAModel']
@inherit_doc
+class BisectingKMeansModel(JavaModelWrapper):
+ """
+ .. note:: Experimental
+
+ A clustering model derived from the bisecting k-means method.
+
+ >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
+ >>> bskm = BisectingKMeans()
+ >>> model = bskm.train(sc.parallelize(data, 2), k=4)
+ >>> p = array([0.0, 0.0])
+ >>> model.predict(p)
+ 0
+ >>> model.k
+ 4
+ >>> model.computeCost(p)
+ 0.0
+
+ .. versionadded:: 2.0.0
+ """
+
+ def __init__(self, java_model):
+ super(BisectingKMeansModel, self).__init__(java_model)
+ self.centers = [c.toArray() for c in self.call("clusterCenters")]
+
+ @property
+ @since('2.0.0')
+ def clusterCenters(self):
+ """Get the cluster centers, represented as a list of NumPy
+ arrays."""
+ return self.centers
+
+ @property
+ @since('2.0.0')
+ def k(self):
+ """Get the number of clusters"""
+ return self.call("k")
+
+ @since('2.0.0')
+ def predict(self, x):
+ """
+ Find the cluster that each of the points belongs to in this
+ model.
+
+ :param x: the point (or RDD of points) to determine
+ compute the clusters for.
+ """
+ if isinstance(x, RDD):
+ vecs = x.map(_convert_to_vector)
+ return self.call("predict", vecs)
+
+ x = _convert_to_vector(x)
+ return self.call("predict", x)
+
+ @since('2.0.0')
+ def computeCost(self, x):
+ """
+ Return the Bisecting K-means cost (sum of squared distances of
+ points to their nearest center) for this model on the given
+ data. If provided with an RDD of points returns the sum.
+
+ :param point: the point or RDD of points to compute the cost(s).
+ """
+ if isinstance(x, RDD):
+ vecs = x.map(_convert_to_vector)
+ return self.call("computeCost", vecs)
+
+ return self.call("computeCost", _convert_to_vector(x))
+
+
+class BisectingKMeans(object):
+ """
+ .. note:: Experimental
+
+ A bisecting k-means algorithm based on the paper "A comparison of
+ document clustering techniques" by Steinbach, Karypis, and Kumar,
+ with modification to fit Spark.
+ The algorithm starts from a single cluster that contains all points.
+ Iteratively it finds divisible clusters on the bottom level and
+ bisects each of them using k-means, until there are `k` leaf
+ clusters in total or no leaf clusters are divisible.
+ The bisecting steps of clusters on the same level are grouped
+ together to increase parallelism. If bisecting all divisible
+ clusters on the bottom level would result more than `k` leaf
+ clusters, larger clusters get higher priority.
+
+ Based on
+ U{http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf}
+ Steinbach, Karypis, and Kumar, A comparison of document clustering
+ techniques, KDD Workshop on Text Mining, 2000.
+
+ .. versionadded:: 2.0.0
+ """
+
+ @since('2.0.0')
+ def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604):
+ """
+ Runs the bisecting k-means algorithm return the model.
+
+ :param rdd: input RDD to be trained on
+ :param k: The desired number of leaf clusters (default: 4).
+ The actual number could be smaller if there are no divisible
+ leaf clusters.
+ :param maxIterations: the max number of k-means iterations to
+ split clusters (default: 20)
+ :param minDivisibleClusterSize: the minimum number of points
+ (if >= 1.0) or the minimum proportion of points (if < 1.0)
+ of a divisible cluster (default: 1)
+ :param seed: a random seed (default: -1888008604 from
+ classOf[BisectingKMeans].getName.##)
+ """
+ java_model = callMLlibFunc(
+ "trainBisectingKMeans", rdd.map(_convert_to_vector),
+ k, maxIterations, minDivisibleClusterSize, seed)
+ return BisectingKMeansModel(java_model)
+
+
+@inherit_doc
class KMeansModel(Saveable, Loader):
"""A clustering model derived from the k-means method.
@@ -118,7 +235,13 @@ class KMeansModel(Saveable, Loader):
@since('0.9.0')
def predict(self, x):
- """Find the cluster to which x belongs in this model."""
+ """
+ Find the cluster that each of the points belongs to in this
+ model.
+
+ :param x: the point (or RDD of points) to determine
+ compute the clusters for.
+ """
best = 0
best_distance = float("inf")
if isinstance(x, RDD):
@@ -136,7 +259,10 @@ class KMeansModel(Saveable, Loader):
def computeCost(self, rdd):
"""
Return the K-means cost (sum of squared distances of points to
- their nearest center) for this model on the given data.
+ their nearest center) for this model on the given
+ data.
+
+ :param point: the RDD of points to compute the cost on.
"""
cost = callMLlibFunc("computeCostKmeansModel", rdd.map(_convert_to_vector),
[_convert_to_vector(c) for c in self.centers])
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 3436a28b29..32ed48e103 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -419,6 +419,17 @@ class ListTests(MLlibTestCase):
as NumPy arrays.
"""
+ def test_bisecting_kmeans(self):
+ from pyspark.mllib.clustering import BisectingKMeans
+ data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
+ bskm = BisectingKMeans()
+ model = bskm.train(sc.parallelize(data, 2), k=4)
+ p = array([0.0, 0.0])
+ rdd_p = self.sc.parallelize([p])
+ self.assertEqual(model.predict(p), model.predict(rdd_p).first())
+ self.assertEqual(model.computeCost(p), model.computeCost(rdd_p))
+ self.assertEqual(model.k, len(model.clusterCenters))
+
def test_kmeans(self):
from pyspark.mllib.clustering import KMeans
data = [