From 244016a95c43ce6db422378e85a9d527bfe59bf1 Mon Sep 17 00:00:00 2001 From: Yu ISHIKAWA Date: Sun, 2 Aug 2015 09:00:32 +0100 Subject: [SPARK-9149] [ML] [EXAMPLES] Add an example of spark.ml KMeans [SPARK-9149] Add an example of spark.ml KMeans - ASF JIRA https://issues.apache.org/jira/browse/SPARK-9149 jkbradley Should we support other data formats, such as TSV or CSV? I have implemented these examples to support only space-separated files, the same as the example for `spark.mllib`'s `KMeans`. Author: Yu ISHIKAWA Closes #7697 from yu-iskw/SPARK-9149 and squashes the following commits: 7137bad [Yu ISHIKAWA] Fix the typo 56b9da2 [Yu ISHIKAWA] Fix the place of the wrong import statement 554e574 [Yu ISHIKAWA] Change the way to format input data in KMeansExample e7a948a [Yu ISHIKAWA] Import spark.ml.clustering.KMeans 1901e0c [Yu ISHIKAWA] Change how to initialize an array for a DataFrame schema d8043f5 [Yu ISHIKAWA] Return a value directly d81bf55 [Yu ISHIKAWA] Fix a typo and its access specifiers 3e0862d [Yu ISHIKAWA] Make KMeansExample more simple 51ce9c1 [Yu ISHIKAWA] Make JavaKMeansExample more simple a5a01e0 [Yu ISHIKAWA] Fix a Javadoc about the command to execute the example b09ec13 [Yu ISHIKAWA] [SPARK-9149][ML][Examples] Add an example of spark.ml KMeans --- examples/src/main/python/ml/kmeans_example.py | 71 +++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 examples/src/main/python/ml/kmeans_example.py (limited to 'examples/src/main/python/ml') diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py new file mode 100644 index 0000000000..150dadd42f --- /dev/null +++ b/examples/src/main/python/ml/kmeans_example.py @@ -0,0 +1,71 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

import sys
import re

import numpy as np
from pyspark import SparkContext
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.mllib.linalg import VectorUDT, _convert_to_vector
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType

"""
A simple example demonstrating a k-means clustering.
Run with:
  bin/spark-submit examples/src/main/python/ml/kmeans_example.py <file> <k>

This example requires NumPy (http://www.numpy.org/).
"""


def parseVector(line):
    """Parse one line of whitespace-separated numbers into a vector.

    :param line: text line holding the float components of one point.
    :return: the result of ``_convert_to_vector`` applied to a NumPy array
             of the parsed floats.
    :raises ValueError: if any token on the line is not a valid float.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so "1.0  2.0 " no longer yields empty tokens
    # that float() would reject.
    array = np.array([float(x) for x in line.split()])
    return _convert_to_vector(array)


if __name__ == "__main__":

    FEATURES_COL = "features"
    USAGE = "Usage: kmeans_example.py <file> <k>"

    if len(sys.argv) != 3:
        print(USAGE, file=sys.stderr)
        sys.exit(-1)
    path = sys.argv[1]
    try:
        # The number of clusters arrives as text on the command line.
        k = int(sys.argv[2])
    except ValueError:
        print(USAGE, file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonKMeansExample")
    try:
        sqlContext = SQLContext(sc)

        # Build a single-column DataFrame of feature vectors from the file.
        lines = sc.textFile(path)
        data = lines.map(parseVector)
        row_rdd = data.map(lambda x: Row(x))
        schema = StructType([StructField(FEATURES_COL, VectorUDT(), False)])
        df = sqlContext.createDataFrame(row_rdd, schema)

        # Use the requested k rather than a hard-coded cluster count.
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL)
        model = kmeans.fit(df)
        centers = model.clusterCenters()

        print("Cluster Centers: ")
        for center in centers:
            print(center)
    finally:
        # Stop the context even when loading or fitting fails.
        sc.stop()