Diffstat (limited to 'examples/src/main/python/mllib')
-rwxr-xr-x  examples/src/main/python/mllib/kmeans.py               44
-rwxr-xr-x  examples/src/main/python/mllib/logistic_regression.py  50
2 files changed, 94 insertions, 0 deletions
diff --git a/examples/src/main/python/mllib/kmeans.py b/examples/src/main/python/mllib/kmeans.py
new file mode 100755
index 0000000000..dec82ff34f
--- /dev/null
+++ b/examples/src/main/python/mllib/kmeans.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+A K-means clustering program using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
+"""
+
+import sys
+
+import numpy as np
+from pyspark import SparkContext
+from pyspark.mllib.clustering import KMeans
+
+
+def parseVector(line):
+    return np.array([float(x) for x in line.split(' ')])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print >> sys.stderr, "Usage: kmeans <master> <file> <k>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "KMeans")
+    lines = sc.textFile(sys.argv[2])
+    data = lines.map(parseVector)
+    k = int(sys.argv[3])
+    model = KMeans.train(data, k)
+    print "Final centers: " + str(model.clusterCenters)
diff --git a/examples/src/main/python/mllib/logistic_regression.py b/examples/src/main/python/mllib/logistic_regression.py
new file mode 100755
index 0000000000..8631051d00
--- /dev/null
+++ b/examples/src/main/python/mllib/logistic_regression.py
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Logistic regression using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
+"""
+
+from math import exp
+import sys
+
+import numpy as np
+from pyspark import SparkContext
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.classification import LogisticRegressionWithSGD
+
+
+# Parse a line of text into an MLlib LabeledPoint object
+def parsePoint(line):
+    values = [float(s) for s in line.split(' ')]
+    if values[0] == -1:   # Convert -1 labels to 0 for MLlib
+        values[0] = 0
+    return LabeledPoint(values[0], values[1:])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print >> sys.stderr, "Usage: logistic_regression <master> <file> <iters>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonLR")
+    points = sc.textFile(sys.argv[2]).map(parsePoint)
+    iterations = int(sys.argv[3])
+    model = LogisticRegressionWithSGD.train(points, iterations)
+    print "Final weights: " + str(model.weights)
+    print "Final intercept: " + str(model.intercept)