author     Nicholas Chammas <nicholas.chammas@gmail.com>   2014-08-06 12:58:24 -0700
committer  Reynold Xin <rxin@apache.org>                   2014-08-06 12:58:24 -0700
commit  d614967b0bad1e6c5277d612602ec0a653a00258 (patch)
tree    8df1a52cbe074af4f928c0ac8f08a63075882d0b /python/pyspark/mllib
parent  a6cd31108f0d73ce6823daafe8447677e03cfd13 (diff)
[SPARK-2627] [PySpark] have the build enforce PEP 8 automatically
As described in [SPARK-2627](https://issues.apache.org/jira/browse/SPARK-2627), we'd like Python code to automatically be checked for PEP 8 compliance by Jenkins. This pull request aims to do that.

Notes:
* We may need to install [`pep8`](https://pypi.python.org/pypi/pep8) on the build server.
* I'm expecting tests to fail now that PEP 8 compliance is being checked as part of the build. I'm fine with cleaning up any remaining PEP 8 violations as part of this pull request.
* I did not understand why the RAT and scalastyle reports are saved to text files. I did the same for the PEP 8 check, but only so that the console output style can match those for the RAT and scalastyle checks. The PEP 8 report is removed right after the check is complete.
* Updates to the ["Contributing to Spark"](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) guide will be submitted elsewhere, as I don't believe that text is part of the Spark repo.

Author: Nicholas Chammas <nicholas.chammas@gmail.com>
Author: nchammas <nicholas.chammas@gmail.com>

Closes #1744 from nchammas/master and squashes the following commits:

274b238 [Nicholas Chammas] [SPARK-2627] [PySpark] minor indentation changes
983d963 [nchammas] Merge pull request #5 from apache/master
1db5314 [nchammas] Merge pull request #4 from apache/master
0e0245f [Nicholas Chammas] [SPARK-2627] undo erroneous whitespace fixes
bf30942 [Nicholas Chammas] [SPARK-2627] PEP8: comment spacing
6db9a44 [nchammas] Merge pull request #3 from apache/master
7b4750e [Nicholas Chammas] merge upstream changes
91b7584 [Nicholas Chammas] [SPARK-2627] undo unnecessary line breaks
44e3e56 [Nicholas Chammas] [SPARK-2627] use tox.ini to exclude files
b09fae2 [Nicholas Chammas] don't wrap comments unnecessarily
bfb9f9f [Nicholas Chammas] [SPARK-2627] keep up with the PEP 8 fixes
9da347f [nchammas] Merge pull request #2 from apache/master
aa5b4b5 [Nicholas Chammas] [SPARK-2627] follow Spark bash style for if blocks
d0a83b9 [Nicholas Chammas] [SPARK-2627] check that pep8 downloaded fine
dffb5dd [Nicholas Chammas] [SPARK-2627] download pep8 at runtime
a1ce7ae [Nicholas Chammas] [SPARK-2627] space out test report sections
21da538 [Nicholas Chammas] [SPARK-2627] it's PEP 8, not PEP8
6f4900b [Nicholas Chammas] [SPARK-2627] more misc PEP 8 fixes
fe57ed0 [Nicholas Chammas] removing merge conflict backups
9c01d4c [nchammas] Merge pull request #1 from apache/master
9a66cb0 [Nicholas Chammas] resolving merge conflicts
a31ccc4 [Nicholas Chammas] [SPARK-2627] miscellaneous PEP 8 fixes
beaa9ac [Nicholas Chammas] [SPARK-2627] fail check on non-zero status
723ed39 [Nicholas Chammas] always delete the report file
0541ebb [Nicholas Chammas] [SPARK-2627] call Python linter from run-tests
12440fa [Nicholas Chammas] [SPARK-2627] add Scala linter
61c07b9 [Nicholas Chammas] [SPARK-2627] add Python linter
75ad552 [Nicholas Chammas] make check output style consistent
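For orientation (this sketch is not part of the commit), the flow the message and the squashed commits describe — download pep8 at runtime, lint the PySpark tree against exclusions kept in tox.ini, write a report file only so the console output matches the RAT and scalastyle checks, fail on a non-zero status, and always delete the report — looks roughly like this in Python. The paths, download URL, and report-file name are illustrative assumptions; the PR itself implements the check as shell tooling invoked from run-tests.

    # Minimal sketch under assumed names; not the PR's actual script.
    import os
    import subprocess
    import sys
    import urllib  # Python 2 stdlib, matching PySpark at the time

    PEP8_SCRIPT = "dev/pep8.py"                # hypothetical path
    PEP8_URL = "https://raw.githubusercontent.com/jcrocholl/pep8/1.5.7/pep8.py"
    REPORT_PATH = "dev/pep8-report.txt"        # hypothetical report file

    if not os.path.isfile(PEP8_SCRIPT):
        # "download pep8 at runtime": the build server needs no preinstall
        urllib.urlretrieve(PEP8_URL, PEP8_SCRIPT)

    try:
        # The excluded files live in tox.ini's [pep8] section, which the
        # pep8 tool can read as its project configuration.
        with open(REPORT_PATH, "w") as report:
            status = subprocess.call(
                [sys.executable, PEP8_SCRIPT, "python/pyspark"],
                stdout=report, stderr=subprocess.STDOUT)
        if status != 0:
            # Echo the report so the console output matches the RAT and
            # scalastyle checks, then fail on the non-zero status.
            with open(REPORT_PATH) as report:
                sys.stdout.write(report.read())
            sys.exit(status)
    finally:
        # The report is removed right after the check completes.
        if os.path.isfile(REPORT_PATH):
            os.remove(REPORT_PATH)

The exclusions themselves would sit in tox.ini along these lines (values illustrative, not quoted from the PR):

    [pep8]
    max-line-length=100
    exclude=cloudpickle.py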
Diffstat (limited to 'python/pyspark/mllib')
-rw-r--r--  python/pyspark/mllib/_common.py          5
-rw-r--r--  python/pyspark/mllib/classification.py   8
-rw-r--r--  python/pyspark/mllib/clustering.py       3
-rw-r--r--  python/pyspark/mllib/linalg.py           2
-rw-r--r--  python/pyspark/mllib/random.py          14
-rw-r--r--  python/pyspark/mllib/recommendation.py   2
-rw-r--r--  python/pyspark/mllib/regression.py      12
-rw-r--r--  python/pyspark/mllib/stat.py             1
-rw-r--r--  python/pyspark/mllib/tests.py           11
-rw-r--r--  python/pyspark/mllib/tree.py             4
-rw-r--r--  python/pyspark/mllib/util.py             1
11 files changed, 50 insertions, 13 deletions
diff --git a/python/pyspark/mllib/_common.py b/python/pyspark/mllib/_common.py
index 9c1565affb..db341da85f 100644
--- a/python/pyspark/mllib/_common.py
+++ b/python/pyspark/mllib/_common.py
@@ -72,9 +72,9 @@ except:
# Python interpreter must agree on what endian the machine is.
-DENSE_VECTOR_MAGIC  = 1
+DENSE_VECTOR_MAGIC = 1
SPARSE_VECTOR_MAGIC = 2
-DENSE_MATRIX_MAGIC  = 3
+DENSE_MATRIX_MAGIC = 3
LABELED_POINT_MAGIC = 4
@@ -443,6 +443,7 @@ def _serialize_rating(r):
class RatingDeserializer(Serializer):
+
def loads(self, stream):
length = struct.unpack("!i", stream.read(4))[0]
ba = stream.read(length)
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 5ec1a8084d..ffdda7ee19 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -31,6 +31,7 @@ from math import exp, log
class LogisticRegressionModel(LinearModel):
+
"""A linear binary classification model derived from logistic regression.
>>> data = [
@@ -60,6 +61,7 @@ class LogisticRegressionModel(LinearModel):
>>> lrm.predict(SparseVector(2, {1: 0.0})) <= 0
True
"""
+
def predict(self, x):
_linear_predictor_typecheck(x, self._coeff)
margin = _dot(x, self._coeff) + self._intercept
@@ -72,6 +74,7 @@ class LogisticRegressionModel(LinearModel):
class LogisticRegressionWithSGD(object):
+
@classmethod
def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
initialWeights=None, regParam=1.0, regType=None, intercept=False):
@@ -108,6 +111,7 @@ class LogisticRegressionWithSGD(object):
class SVMModel(LinearModel):
+
"""A support vector machine.
>>> data = [
@@ -131,6 +135,7 @@ class SVMModel(LinearModel):
>>> svm.predict(SparseVector(2, {0: -1.0})) <= 0
True
"""
+
def predict(self, x):
_linear_predictor_typecheck(x, self._coeff)
margin = _dot(x, self._coeff) + self._intercept
@@ -138,6 +143,7 @@ class SVMModel(LinearModel):
class SVMWithSGD(object):
+
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=1.0,
miniBatchFraction=1.0, initialWeights=None, regType=None, intercept=False):
@@ -173,6 +179,7 @@ class SVMWithSGD(object):
class NaiveBayesModel(object):
+
"""
Model for Naive Bayes classifiers.
@@ -213,6 +220,7 @@ class NaiveBayesModel(object):
class NaiveBayes(object):
+
@classmethod
def train(cls, data, lambda_=1.0):
"""
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index b380e8f6c8..a0630d1d5c 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -27,6 +27,7 @@ from pyspark.mllib.linalg import SparseVector
class KMeansModel(object):
+
"""A clustering model derived from the k-means method.
>>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4,2)
@@ -55,6 +56,7 @@ class KMeansModel(object):
>>> type(model.clusterCenters)
<type 'list'>
"""
+
def __init__(self, centers):
self.centers = centers
@@ -76,6 +78,7 @@ class KMeansModel(object):
class KMeans(object):
+
@classmethod
def train(cls, data, k, maxIterations=100, runs=1, initializationMode="k-means||"):
"""Train a k-means clustering model."""
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 54720c2324..9a239abfbb 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -27,6 +27,7 @@ from numpy import array, array_equal, ndarray, float64, int32
class SparseVector(object):
+
"""
A simple sparse vector class for passing data to MLlib. Users may
alternatively pass SciPy's {scipy.sparse} data types.
@@ -192,6 +193,7 @@ class SparseVector(object):
class Vectors(object):
+
"""
Factory methods for working with vectors. Note that dense vectors
are simply represented as NumPy array objects, so there is no need
diff --git a/python/pyspark/mllib/random.py b/python/pyspark/mllib/random.py
index 36e710dbae..eb496688b6 100644
--- a/python/pyspark/mllib/random.py
+++ b/python/pyspark/mllib/random.py
@@ -24,7 +24,9 @@ from pyspark.rdd import RDD
from pyspark.mllib._common import _deserialize_double, _deserialize_double_vector
from pyspark.serializers import NoOpSerializer
+
class RandomRDDGenerators:
+
"""
Generator methods for creating RDDs comprised of i.i.d samples from
some distribution.
@@ -53,7 +55,7 @@ class RandomRDDGenerators:
True
"""
jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size, numPartitions, seed)
- uniform =  RDD(jrdd, sc, NoOpSerializer())
+ uniform = RDD(jrdd, sc, NoOpSerializer())
return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))
@staticmethod
@@ -77,7 +79,7 @@ class RandomRDDGenerators:
True
"""
jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed)
- normal =  RDD(jrdd, sc, NoOpSerializer())
+ normal = RDD(jrdd, sc, NoOpSerializer())
return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))
@staticmethod
@@ -98,7 +100,7 @@ class RandomRDDGenerators:
True
"""
jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed)
- poisson =  RDD(jrdd, sc, NoOpSerializer())
+ poisson = RDD(jrdd, sc, NoOpSerializer())
return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))
@staticmethod
@@ -118,7 +120,7 @@ class RandomRDDGenerators:
"""
jrdd = sc._jvm.PythonMLLibAPI() \
.uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
- uniform =  RDD(jrdd, sc, NoOpSerializer())
+ uniform = RDD(jrdd, sc, NoOpSerializer())
return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
@staticmethod
@@ -138,7 +140,7 @@ class RandomRDDGenerators:
"""
jrdd = sc._jvm.PythonMLLibAPI() \
.normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
- normal =  RDD(jrdd, sc, NoOpSerializer())
+ normal = RDD(jrdd, sc, NoOpSerializer())
return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
@staticmethod
@@ -161,7 +163,7 @@ class RandomRDDGenerators:
"""
jrdd = sc._jvm.PythonMLLibAPI() \
.poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
- poisson =  RDD(jrdd, sc, NoOpSerializer())
+ poisson = RDD(jrdd, sc, NoOpSerializer())
return poisson.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py
index 6c385042ff..e863fc249e 100644
--- a/python/pyspark/mllib/recommendation.py
+++ b/python/pyspark/mllib/recommendation.py
@@ -26,6 +26,7 @@ from pyspark.rdd import RDD
class MatrixFactorizationModel(object):
+
"""A matrix factorisation model trained by regularized alternating
least-squares.
@@ -58,6 +59,7 @@ class MatrixFactorizationModel(object):
class ALS(object):
+
@classmethod
def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1):
sc = ratings.context
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 041b119269..d8792cf448 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -27,6 +27,7 @@ from pyspark.mllib.linalg import SparseVector, Vectors
class LabeledPoint(object):
+
"""
The features and labels of a data point.
@@ -34,6 +35,7 @@ class LabeledPoint(object):
@param features: Vector of features for this point (NumPy array, list,
pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix)
"""
+
def __init__(self, label, features):
self.label = label
if (type(features) == ndarray or type(features) == SparseVector
@@ -49,7 +51,9 @@ class LabeledPoint(object):
class LinearModel(object):
+
"""A linear model that has a vector of coefficients and an intercept."""
+
def __init__(self, weights, intercept):
self._coeff = weights
self._intercept = intercept
@@ -64,6 +68,7 @@ class LinearModel(object):
class LinearRegressionModelBase(LinearModel):
+
"""A linear regression model.
>>> lrmb = LinearRegressionModelBase(array([1.0, 2.0]), 0.1)
@@ -72,6 +77,7 @@ class LinearRegressionModelBase(LinearModel):
>>> abs(lrmb.predict(SparseVector(2, {0: -1.03, 1: 7.777})) - 14.624) < 1e-6
True
"""
+
def predict(self, x):
"""Predict the value of the dependent variable given a vector x"""
"""containing values for the independent variables."""
@@ -80,6 +86,7 @@ class LinearRegressionModelBase(LinearModel):
class LinearRegressionModel(LinearRegressionModelBase):
+
"""A linear regression model derived from a least-squares fit.
>>> from pyspark.mllib.regression import LabeledPoint
@@ -111,6 +118,7 @@ class LinearRegressionModel(LinearRegressionModelBase):
class LinearRegressionWithSGD(object):
+
@classmethod
def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
initialWeights=None, regParam=1.0, regType=None, intercept=False):
@@ -146,6 +154,7 @@ class LinearRegressionWithSGD(object):
class LassoModel(LinearRegressionModelBase):
+
"""A linear regression model derived from a least-squares fit with an
l_1 penalty term.
@@ -178,6 +187,7 @@ class LassoModel(LinearRegressionModelBase):
class LassoWithSGD(object):
+
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=1.0,
miniBatchFraction=1.0, initialWeights=None):
@@ -189,6 +199,7 @@ class LassoWithSGD(object):
class RidgeRegressionModel(LinearRegressionModelBase):
+
"""A linear regression model derived from a least-squares fit with an
l_2 penalty term.
@@ -221,6 +232,7 @@ class RidgeRegressionModel(LinearRegressionModelBase):
class RidgeRegressionWithSGD(object):
+
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=1.0,
miniBatchFraction=1.0, initialWeights=None):
diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat.py
index 0a08a562d1..982906b9d0 100644
--- a/python/pyspark/mllib/stat.py
+++ b/python/pyspark/mllib/stat.py
@@ -24,6 +24,7 @@ from pyspark.mllib._common import \
_serialize_double, _serialize_double_vector, \
_deserialize_double, _deserialize_double_matrix
+
class Statistics(object):
@staticmethod
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 9d1e5be637..6f3ec8ac94 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -39,6 +39,7 @@ except:
class VectorTests(unittest.TestCase):
+
def test_serialize(self):
sv = SparseVector(4, {1: 1, 3: 2})
dv = array([1., 2., 3., 4.])
@@ -81,6 +82,7 @@ class VectorTests(unittest.TestCase):
class ListTests(PySparkTestCase):
+
"""
Test MLlib algorithms on plain lists, to make sure they're passed through
as NumPy arrays.
@@ -128,7 +130,7 @@ class ListTests(PySparkTestCase):
self.assertTrue(nb_model.predict(features[2]) <= 0)
self.assertTrue(nb_model.predict(features[3]) > 0)
- categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories
+ categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
dt_model = \
DecisionTree.trainClassifier(rdd, numClasses=2,
categoricalFeaturesInfo=categoricalFeaturesInfo)
@@ -168,7 +170,7 @@ class ListTests(PySparkTestCase):
self.assertTrue(rr_model.predict(features[2]) <= 0)
self.assertTrue(rr_model.predict(features[3]) > 0)
- categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories
+ categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
dt_model = \
DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
self.assertTrue(dt_model.predict(features[0]) <= 0)
@@ -179,6 +181,7 @@ class ListTests(PySparkTestCase):
@unittest.skipIf(not _have_scipy, "SciPy not installed")
class SciPyTests(PySparkTestCase):
+
"""
Test both vector operations and MLlib algorithms with SciPy sparse matrices,
if SciPy is available.
@@ -276,7 +279,7 @@ class SciPyTests(PySparkTestCase):
self.assertTrue(nb_model.predict(features[2]) <= 0)
self.assertTrue(nb_model.predict(features[3]) > 0)
- categoricalFeaturesInfo = {0: 3} # feature 0 has 3 categories
+ categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
categoricalFeaturesInfo=categoricalFeaturesInfo)
self.assertTrue(dt_model.predict(features[0]) <= 0)
@@ -315,7 +318,7 @@ class SciPyTests(PySparkTestCase):
self.assertTrue(rr_model.predict(features[2]) <= 0)
self.assertTrue(rr_model.predict(features[3]) > 0)
- categoricalFeaturesInfo = {0: 2} # feature 0 has 2 categories
+ categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
self.assertTrue(dt_model.predict(features[0]) <= 0)
self.assertTrue(dt_model.predict(features[1]) > 0)
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 1e0006df75..2518001ea0 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -25,7 +25,9 @@ from pyspark.mllib._common import \
from pyspark.mllib.regression import LabeledPoint
from pyspark.serializers import NoOpSerializer
+
class DecisionTreeModel(object):
+
"""
A decision tree model for classification or regression.
@@ -77,6 +79,7 @@ class DecisionTreeModel(object):
class DecisionTree(object):
+
"""
Learning algorithm for a decision tree model
for classification or regression.
@@ -174,7 +177,6 @@ class DecisionTree(object):
categoricalFeaturesInfo,
impurity, maxDepth, maxBins)
-
@staticmethod
def train(data, algo, numClasses, categoricalFeaturesInfo,
impurity, maxDepth, maxBins=100):
diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py
index 639cda6350..4962d05491 100644
--- a/python/pyspark/mllib/util.py
+++ b/python/pyspark/mllib/util.py
@@ -26,6 +26,7 @@ from pyspark.serializers import NoOpSerializer
class MLUtils:
+
"""
Helper methods to load, save and pre-process data used in MLlib.
"""