about | summary | refs | log | tree | commit | diff
path: root/python/examples
diff options
context:
space:
mode:
Diffstat (limited to 'python/examples')
-rwxr-xr-x python/examples/kmeans.py 11
-rwxr-xr-x python/examples/logistic_regression.py 8
-rwxr-xr-x python/examples/mllib/kmeans.py 44
-rwxr-xr-x python/examples/mllib/logistic_regression.py 50
4 files changed, 107 insertions, 6 deletions
diff --git a/python/examples/kmeans.py b/python/examples/kmeans.py
index ba31af92fc..d8387b0b18 100755
--- a/python/examples/kmeans.py
+++ b/python/examples/kmeans.py
@@ -16,8 +16,13 @@
#
"""
-This example requires numpy (http://www.numpy.org/)
+The K-means algorithm written from scratch against PySpark. In practice,
+one may prefer to use the KMeans algorithm in MLlib, as shown in
+python/examples/mllib/kmeans.py.
+
+This example requires NumPy (http://www.numpy.org/).
"""
+
import sys
import numpy as np
@@ -49,9 +54,7 @@ if __name__ == "__main__":
K = int(sys.argv[3])
convergeDist = float(sys.argv[4])
- # TODO: change this after we port takeSample()
- #kPoints = data.takeSample(False, K, 34)
- kPoints = data.take(K)
+ kPoints = data.takeSample(False, K, 1)
tempDist = 1.0
while tempDist > convergeDist:
diff --git a/python/examples/logistic_regression.py b/python/examples/logistic_regression.py
index 1117dea538..28d52e6a40 100755
--- a/python/examples/logistic_regression.py
+++ b/python/examples/logistic_regression.py
@@ -16,9 +16,13 @@
#
"""
-A logistic regression implementation that uses NumPy (http://www.numpy.org) to act on batches
-of input data using efficient matrix operations.
+A logistic regression implementation that uses NumPy (http://www.numpy.org)
+to act on batches of input data using efficient matrix operations.
+
+In practice, one may prefer to use the LogisticRegression algorithm in
+MLlib, as shown in python/examples/mllib/logistic_regression.py.
"""
+
from collections import namedtuple
from math import exp
from os.path import realpath
diff --git a/python/examples/mllib/kmeans.py b/python/examples/mllib/kmeans.py
new file mode 100755
index 0000000000..dec82ff34f
--- /dev/null
+++ b/python/examples/mllib/kmeans.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+A K-means clustering program using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
+"""
+
+import sys
+
+import numpy as np
+from pyspark import SparkContext
+from pyspark.mllib.clustering import KMeans
+
+
+def parseVector(line):
+    return np.array([float(x) for x in line.split(' ')])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print >> sys.stderr, "Usage: kmeans <master> <file> <k>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "KMeans")
+    lines = sc.textFile(sys.argv[2])
+    data = lines.map(parseVector)
+    k = int(sys.argv[3])
+    model = KMeans.train(data, k)
+    print "Final centers: " + str(model.clusterCenters)
diff --git a/python/examples/mllib/logistic_regression.py b/python/examples/mllib/logistic_regression.py
new file mode 100755
index 0000000000..8631051d00
--- /dev/null
+++ b/python/examples/mllib/logistic_regression.py
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Logistic regression using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
+"""
+
+from math import exp
+import sys
+
+import numpy as np
+from pyspark import SparkContext
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.classification import LogisticRegressionWithSGD
+
+
+# Parse a line of text into an MLlib LabeledPoint object
+def parsePoint(line):
+    values = [float(s) for s in line.split(' ')]
+    if values[0] == -1: # Convert -1 labels to 0 for MLlib
+        values[0] = 0
+    return LabeledPoint(values[0], values[1:])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print >> sys.stderr, "Usage: logistic_regression <master> <file> <iters>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonLR")
+    points = sc.textFile(sys.argv[2]).map(parsePoint)
+    iterations = int(sys.argv[3])
+    model = LogisticRegressionWithSGD.train(points, iterations)
+    print "Final weights: " + str(model.weights)
+    print "Final intercept: " + str(model.intercept)