4 files changed, 107 insertions, 6 deletions
diff --git a/python/examples/kmeans.py b/python/examples/kmeans.py
index ba31af92fc..d8387b0b18 100755
--- a/python/examples/kmeans.py
+++ b/python/examples/kmeans.py
@@ -16,8 +16,13 @@
 #
 
 """
-This example requires numpy (http://www.numpy.org/)
+The K-means algorithm written from scratch against PySpark. In practice,
+one may prefer to use the KMeans algorithm in MLlib, as shown in
+python/examples/mllib/kmeans.py.
+
+This example requires NumPy (http://www.numpy.org/).
 """
+
 import sys
 
 import numpy as np
@@ -49,9 +54,7 @@ if __name__ == "__main__":
     K = int(sys.argv[3])
     convergeDist = float(sys.argv[4])
 
-    # TODO: change this after we port takeSample()
-    #kPoints = data.takeSample(False, K, 34)
-    kPoints = data.take(K)
+    kPoints = data.takeSample(False, K, 1)
     tempDist = 1.0
 
     while tempDist > convergeDist:
diff --git a/python/examples/logistic_regression.py b/python/examples/logistic_regression.py
index 1117dea538..28d52e6a40 100755
--- a/python/examples/logistic_regression.py
+++ b/python/examples/logistic_regression.py
@@ -16,9 +16,13 @@
 #
 
 """
-A logistic regression implementation that uses NumPy (http://www.numpy.org) to act on batches
-of input data using efficient matrix operations.
+A logistic regression implementation that uses NumPy (http://www.numpy.org)
+to act on batches of input data using efficient matrix operations.
+
+In practice, one may prefer to use the LogisticRegression algorithm in
+MLlib, as shown in python/examples/mllib/logistic_regression.py.
 """
+
 from collections import namedtuple
 from math import exp
 from os.path import realpath
diff --git a/python/examples/mllib/kmeans.py b/python/examples/mllib/kmeans.py
new file mode 100755
index 0000000000..dec82ff34f
--- /dev/null
+++ b/python/examples/mllib/kmeans.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+A K-means clustering program using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
+"""
+
+import sys
+
+import numpy as np
+from pyspark import SparkContext
+from pyspark.mllib.clustering import KMeans
+
+
+def parseVector(line):  # whitespace-separated floats -> 1-D NumPy array
+    return np.array([float(x) for x in line.split()])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print >> sys.stderr, "Usage: kmeans <master> <file> <k>"
+        sys.exit(-1)  # bare exit() is an interactive-shell helper
+    sc = SparkContext(sys.argv[1], "KMeans")
+    # One vector per input line; argv[2] is the data file, argv[3] is k
+    data = sc.textFile(sys.argv[2]).map(parseVector)
+    k = int(sys.argv[3])
+    model = KMeans.train(data, k)
+    print "Final centers: " + str(model.clusterCenters)
diff --git a/python/examples/mllib/logistic_regression.py b/python/examples/mllib/logistic_regression.py
new file mode 100755
index 0000000000..8631051d00
--- /dev/null
+++ b/python/examples/mllib/logistic_regression.py
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Logistic regression using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
+"""
+
+from math import exp
+import sys
+
+import numpy as np
+from pyspark import SparkContext
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.classification import LogisticRegressionWithSGD
+
+
+# Parse a line "<label> <feature> <feature> ..." into an MLlib LabeledPoint
+def parsePoint(line):
+    values = [float(s) for s in line.split()]  # split on any whitespace run
+    # Convert -1 labels to 0 for MLlib; every other label is passed through
+    label = 0 if values[0] == -1 else values[0]
+    return LabeledPoint(label, values[1:])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print >> sys.stderr, "Usage: logistic_regression <master> <file> <iters>"
+        sys.exit(-1)  # bare exit() is an interactive-shell helper
+    sc = SparkContext(sys.argv[1], "PythonLR")
+    points = sc.textFile(sys.argv[2]).map(parsePoint)
+    # argv[3] (<iters>) is the iteration count handed to the trainer
+    model = LogisticRegressionWithSGD.train(points, int(sys.argv[3]))
+    print "Final weights: " + str(model.weights)
+    print "Final intercept: " + str(model.intercept)