Diffstat (limited to 'pyspark')
-rw-r--r--  pyspark/examples/kmeans.py                                                     13
-rwxr-xr-x  pyspark/examples/logistic_regression.py (renamed from pyspark/examples/lr.py)   4
-rw-r--r--  pyspark/examples/pi.py                                                          5
-rw-r--r--  pyspark/examples/transitive_closure.py (renamed from pyspark/examples/tc.py)    5
-rw-r--r--  pyspark/examples/wordcount.py                                                   4
-rw-r--r--  pyspark/pyspark/__init__.py                                                    13
6 files changed, 31 insertions, 13 deletions
diff --git a/pyspark/examples/kmeans.py b/pyspark/examples/kmeans.py
index 9cc366f03c..ad2be21178 100644
--- a/pyspark/examples/kmeans.py
+++ b/pyspark/examples/kmeans.py
@@ -1,18 +1,21 @@
+"""
+This example requires numpy (http://www.numpy.org/)
+"""
import sys
-from pyspark.context import SparkContext
-from numpy import array, sum as np_sum
+import numpy as np
+from pyspark import SparkContext
def parseVector(line):
- return array([float(x) for x in line.split(' ')])
+ return np.array([float(x) for x in line.split(' ')])
def closestPoint(p, centers):
bestIndex = 0
closest = float("+inf")
for i in range(len(centers)):
- tempDist = np_sum((p - centers[i]) ** 2)
+ tempDist = np.sum((p - centers[i]) ** 2)
if tempDist < closest:
closest = tempDist
bestIndex = i
@@ -41,7 +44,7 @@ if __name__ == "__main__":
newPoints = pointStats.map(
lambda (x, (y, z)): (x, y / z)).collect()
- tempDist = sum(np_sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
+ tempDist = sum(np.sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
for (x, y) in newPoints:
kPoints[x] = y
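For reference, the convergence check touched in the last hunk can be exercised without a Spark cluster. A minimal sketch using plain lists in place of the collected RDD results; the sample centroids are made up for illustration:

import numpy as np

kPoints = [np.array([0.0, 0.0]), np.array([10.0, 10.0])]
# (cluster index, new centroid) pairs, standing in for pointStats.map(...).collect()
newPoints = [(0, np.array([0.5, 0.5])), (1, np.array([9.5, 9.5]))]

# total squared distance moved by the centroids this iteration, as in the hunk above
tempDist = sum(np.sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
for (x, y) in newPoints:
    kPoints[x] = y

print(tempDist)  # 1.0; the driver keeps iterating until this drops below its threshold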
diff --git a/pyspark/examples/lr.py b/pyspark/examples/logistic_regression.py
index 5fca0266b8..f13698a86f 100755
--- a/pyspark/examples/lr.py
+++ b/pyspark/examples/logistic_regression.py
@@ -7,7 +7,7 @@ from os.path import realpath
import sys
import numpy as np
-from pyspark.context import SparkContext
+from pyspark import SparkContext
N = 100000 # Number of data points
@@ -32,7 +32,7 @@ def generateData():
if __name__ == "__main__":
if len(sys.argv) == 1:
print >> sys.stderr, \
- "Usage: PythonLR <host> [<slices>]"
+ "Usage: PythonLR <master> [<slices>]"
exit(-1)
sc = SparkContext(sys.argv[1], "PythonLR", pyFiles=[realpath(__file__)])
slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
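The corrected usage strings in this commit all follow the same convention: argv[1] is the Spark master URL and the optional argv[2] is the number of slices. The driver boilerplate shared by these examples, collected here as a Python 2 sketch (pyFiles ships the script itself so the functions it defines are importable on the workers):

import sys
from os.path import realpath

from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) == 1:
        print >> sys.stderr, \
            "Usage: PythonLR <master> [<slices>]"
        exit(-1)
    # argv[1] is a master URL such as "local"; pyFiles distributes this file
    # to the worker Python processes.
    sc = SparkContext(sys.argv[1], "PythonLR", pyFiles=[realpath(__file__)])
    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2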
diff --git a/pyspark/examples/pi.py b/pyspark/examples/pi.py
index 348bbc5dce..127cba029b 100644
--- a/pyspark/examples/pi.py
+++ b/pyspark/examples/pi.py
@@ -1,13 +1,14 @@
import sys
from random import random
from operator import add
-from pyspark.context import SparkContext
+
+from pyspark import SparkContext
if __name__ == "__main__":
if len(sys.argv) == 1:
print >> sys.stderr, \
- "Usage: PythonPi <host> [<slices>]"
+ "Usage: PythonPi <master> [<slices>]"
exit(-1)
sc = SparkContext(sys.argv[1], "PythonPi")
slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
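Only the imports and the usage string of pi.py are touched here; for context, a hedged sketch of how a Monte Carlo pi estimate is typically written against this API (parallelize, map and reduce are assumed to be available on this version of the RDD class):

import sys
from random import random
from operator import add

from pyspark import SparkContext

def sample(_):
    # one dart at the 2x2 square; 1 if it lands inside the unit circle
    x = random() * 2 - 1
    y = random() * 2 - 1
    return 1 if x * x + y * y < 1 else 0

if __name__ == "__main__":
    sc = SparkContext(sys.argv[1] if len(sys.argv) > 1 else "local", "PythonPi")
    n = 100000
    count = sc.parallelize(range(1, n + 1), 2).map(sample).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))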
diff --git a/pyspark/examples/tc.py b/pyspark/examples/transitive_closure.py
index 9630e72b47..73f7f8fbaf 100644
--- a/pyspark/examples/tc.py
+++ b/pyspark/examples/transitive_closure.py
@@ -1,6 +1,7 @@
import sys
from random import Random
-from pyspark.context import SparkContext
+
+from pyspark import SparkContext
numEdges = 200
numVertices = 100
@@ -20,7 +21,7 @@ def generateGraph():
if __name__ == "__main__":
if len(sys.argv) == 1:
print >> sys.stderr, \
- "Usage: PythonTC <host> [<slices>]"
+ "Usage: PythonTC <master> [<slices>]"
exit(-1)
sc = SparkContext(sys.argv[1], "PythonTC")
slices = sys.argv[2] if len(sys.argv) > 2 else 2
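The algorithm behind the renamed transitive_closure.py can be stated without Spark: keep adding a path (a, d) whenever paths (a, b) and (b, d) already exist, until the edge set stops growing. A plain-Python sketch of that fixpoint:

def transitive_closure(edges):
    tc = set(edges)
    while True:
        # join existing paths end-to-start to discover new ones
        new_paths = {(a, d) for (a, b) in tc for (c, d) in tc if b == c}
        if new_paths <= tc:
            return tc
        tc |= new_paths

print(len(transitive_closure({(1, 2), (2, 3), (3, 4)})))  # 6 reachable pairs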
diff --git a/pyspark/examples/wordcount.py b/pyspark/examples/wordcount.py
index 8365c070e8..857160624b 100644
--- a/pyspark/examples/wordcount.py
+++ b/pyspark/examples/wordcount.py
@@ -1,6 +1,8 @@
import sys
from operator import add
-from pyspark.context import SparkContext
+
+from pyspark import SparkContext
+
if __name__ == "__main__":
if len(sys.argv) < 3:
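wordcount.py only changes its import style here; a hedged sketch of the full pipeline such an example builds, assuming textFile, flatMap, map, reduceByKey and collect exist on this version of the API:

import sys
from operator import add

from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: PythonWordCount <master> <file>")
        exit(-1)
    sc = SparkContext(sys.argv[1], "PythonWordCount")
    lines = sc.textFile(sys.argv[2], 1)
    counts = lines.flatMap(lambda line: line.split(' ')) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(add)
    for (word, count) in counts.collect():
        print("%s: %i" % (word, count))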
diff --git a/pyspark/pyspark/__init__.py b/pyspark/pyspark/__init__.py
index 8f8402b62b..1ab360a666 100644
--- a/pyspark/pyspark/__init__.py
+++ b/pyspark/pyspark/__init__.py
@@ -1,9 +1,20 @@
+"""
+PySpark is a Python API for Spark.
+
+Public classes:
+
+ - L{SparkContext<pyspark.context.SparkContext>}
+ Main entry point for Spark functionality.
+ - L{RDD<pyspark.rdd.RDD>}
+ A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
+"""
import sys
import os
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
from pyspark.context import SparkContext
+from pyspark.rdd import RDD
-__all__ = ["SparkContext"]
+__all__ = ["SparkContext", "RDD"]
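With the re-exports above, user code can import both public classes from the package root, which is what the example scripts in this commit switch to. A minimal sketch, assuming SPARK_HOME is set as the sys.path manipulation requires and a local master is acceptable:

from pyspark import SparkContext, RDD

sc = SparkContext("local", "InitExample")
rdd = sc.parallelize([1, 2, 3, 4])
# parallelize returns an instance of the re-exported RDD class
assert isinstance(rdd, RDD)
print(rdd.reduce(lambda a, b: a + b))  # 10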