From 7450a992b3b543a373c34fc4444a528954ac4b4a Mon Sep 17 00:00:00 2001 From: "nate.crosswhite" Date: Wed, 21 Jan 2015 10:32:10 -0800 Subject: [SPARK-4749] [mllib]: Allow initializing KMeans clusters using a seed This implements the functionality for SPARK-4749 and provides unit tests in Scala and PySpark Author: nate.crosswhite Author: nxwhite-str Author: Xiangrui Meng Closes #3610 from nxwhite-str/master and squashes the following commits: a2ebbd3 [nxwhite-str] Merge pull request #1 from mengxr/SPARK-4749-kmeans-seed 7668124 [Xiangrui Meng] minor updates f8d5928 [nate.crosswhite] Addressing PR issues 277d367 [nate.crosswhite] Merge remote-tracking branch 'upstream/master' 9156a57 [nate.crosswhite] Merge remote-tracking branch 'upstream/master' 5d087b4 [nate.crosswhite] Adding KMeans train with seed and Scala unit test 616d111 [nate.crosswhite] Merge remote-tracking branch 'upstream/master' 35c1884 [nate.crosswhite] Add kmeans initial seed to pyspark API --- python/pyspark/mllib/clustering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'python/pyspark/mllib/clustering.py') diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index e2492eef5b..6b713aa393 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -78,10 +78,10 @@ class KMeansModel(object): class KMeans(object): @classmethod - def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"): + def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||", seed=None): """Train a k-means clustering model.""" model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector), k, maxIterations, - runs, initializationMode) + runs, initializationMode, seed) centers = callJavaFunc(rdd.context, model.clusterCenters) return KMeansModel([c.toArray() for c in centers]) -- cgit v1.2.3