about | summary | refs | log | tree | commit | diff
path: root/python
diff options
context:
space:
mode:
author: noelsmith <mail@noelsmith.com> 2015-10-20 16:14:20 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-10-20 16:14:20 -0700
commit: 04521ea067d6ed3c5398067f07904d27c77017ff (patch)
tree: 034c802f54a716223e7952900eec33436312a6d7 /python
parent: 9f49895fefc294ef40b2e974f1f8b311087c54df (diff)
download: spark-04521ea067d6ed3c5398067f07904d27c77017ff.tar.gz
spark-04521ea067d6ed3c5398067f07904d27c77017ff.tar.bz2
spark-04521ea067d6ed3c5398067f07904d27c77017ff.zip
[SPARK-10269][PYSPARK][MLLIB] Add @since annotation to pyspark.mllib.classification
Duplicated the `since` decorator from pyspark.sql into pyspark (also tweaked to handle functions without docstrings).

Added `since` to methods and ".. versionadded::" to classes, with the version numbers derived from the file history.

Note: some methods are inherited from the regression module (e.g. LinearModel.intercept), so these won't have version numbers in the API docs until that module is updated.

Author: noelsmith <mail@noelsmith.com>

Closes #8626 from noel-smith/SPARK-10269-since-mlib-classification.
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/mllib/classification.py 70
1 files changed, 66 insertions, 4 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index b77754500b..aab4015ba8 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -20,7 +20,7 @@ from math import exp
import numpy
from numpy import array
-from pyspark import RDD
+from pyspark import RDD, since
from pyspark.streaming import DStream
from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py
from pyspark.mllib.linalg import DenseVector, SparseVector, _convert_to_vector
@@ -44,6 +44,7 @@ class LinearClassificationModel(LinearModel):
super(LinearClassificationModel, self).__init__(weights, intercept)
self._threshold = None
+ @since('1.4.0')
def setThreshold(self, value):
"""
.. note:: Experimental
@@ -57,6 +58,7 @@ class LinearClassificationModel(LinearModel):
self._threshold = value
@property
+ @since('1.4.0')
def threshold(self):
"""
.. note:: Experimental
@@ -67,6 +69,7 @@ class LinearClassificationModel(LinearModel):
"""
return self._threshold
+ @since('1.4.0')
def clearThreshold(self):
"""
.. note:: Experimental
@@ -76,6 +79,7 @@ class LinearClassificationModel(LinearModel):
"""
self._threshold = None
+ @since('1.4.0')
def predict(self, test):
"""
Predict values for a single data point or an RDD of points
@@ -157,6 +161,8 @@ class LogisticRegressionModel(LinearClassificationModel):
1
>>> mcm.predict([0.0, 0.0, 0.3])
2
+
+ .. versionadded:: 0.9.0
"""
def __init__(self, weights, intercept, numFeatures, numClasses):
super(LogisticRegressionModel, self).__init__(weights, intercept)
@@ -172,13 +178,23 @@ class LogisticRegressionModel(LinearClassificationModel):
self._dataWithBiasSize)
@property
+ @since('1.4.0')
def numFeatures(self):
+ """
+ Dimension of the features.
+ """
return self._numFeatures
@property
+ @since('1.4.0')
def numClasses(self):
+ """
+ Number of possible outcomes for k classes classification problem in Multinomial
+ Logistic Regression.
+ """
return self._numClasses
+ @since('0.9.0')
def predict(self, x):
"""
Predict values for a single data point or an RDD of points
@@ -217,13 +233,21 @@ class LogisticRegressionModel(LinearClassificationModel):
best_class = i + 1
return best_class
+ @since('1.4.0')
def save(self, sc, path):
+ """
+ Save this model to the given path.
+ """
java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel(
_py2java(sc, self._coeff), self.intercept, self.numFeatures, self.numClasses)
java_model.save(sc._jsc.sc(), path)
@classmethod
+ @since('1.4.0')
def load(cls, sc, path):
+ """
+ Load a model from the given path.
+ """
java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel.load(
sc._jsc.sc(), path)
weights = _java2py(sc, java_model.weights())
@@ -237,8 +261,11 @@ class LogisticRegressionModel(LinearClassificationModel):
class LogisticRegressionWithSGD(object):
-
+ """
+ .. versionadded:: 0.9.0
+ """
@classmethod
+ @since('0.9.0')
def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
initialWeights=None, regParam=0.01, regType="l2", intercept=False,
validateData=True, convergenceTol=0.001):
@@ -286,8 +313,11 @@ class LogisticRegressionWithSGD(object):
class LogisticRegressionWithLBFGS(object):
-
+ """
+ .. versionadded:: 1.2.0
+ """
@classmethod
+ @since('1.2.0')
def train(cls, data, iterations=100, initialWeights=None, regParam=0.01, regType="l2",
intercept=False, corrections=10, tolerance=1e-4, validateData=True, numClasses=2):
"""
@@ -399,11 +429,14 @@ class SVMModel(LinearClassificationModel):
... rmtree(path)
... except:
... pass
+
+ .. versionadded:: 0.9.0
"""
def __init__(self, weights, intercept):
super(SVMModel, self).__init__(weights, intercept)
self._threshold = 0.0
+ @since('0.9.0')
def predict(self, x):
"""
Predict values for a single data point or an RDD of points
@@ -419,13 +452,21 @@ class SVMModel(LinearClassificationModel):
else:
return 1 if margin > self._threshold else 0
+ @since('1.4.0')
def save(self, sc, path):
+ """
+ Save this model to the given path.
+ """
java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel(
_py2java(sc, self._coeff), self.intercept)
java_model.save(sc._jsc.sc(), path)
@classmethod
+ @since('1.4.0')
def load(cls, sc, path):
+ """
+ Load a model from the given path.
+ """
java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load(
sc._jsc.sc(), path)
weights = _java2py(sc, java_model.weights())
@@ -437,8 +478,12 @@ class SVMModel(LinearClassificationModel):
class SVMWithSGD(object):
+ """
+ .. versionadded:: 0.9.0
+ """
@classmethod
+ @since('0.9.0')
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
miniBatchFraction=1.0, initialWeights=None, regType="l2",
intercept=False, validateData=True, convergenceTol=0.001):
@@ -530,13 +575,15 @@ class NaiveBayesModel(Saveable, Loader):
... rmtree(path)
... except OSError:
... pass
- """
+ .. versionadded:: 0.9.0
+ """
def __init__(self, labels, pi, theta):
self.labels = labels
self.pi = pi
self.theta = theta
+ @since('0.9.0')
def predict(self, x):
"""
Return the most likely class for a data vector
@@ -548,6 +595,9 @@ class NaiveBayesModel(Saveable, Loader):
return self.labels[numpy.argmax(self.pi + x.dot(self.theta.transpose()))]
def save(self, sc, path):
+ """
+ Save this model to the given path.
+ """
java_labels = _py2java(sc, self.labels.tolist())
java_pi = _py2java(sc, self.pi.tolist())
java_theta = _py2java(sc, self.theta.tolist())
@@ -556,7 +606,11 @@ class NaiveBayesModel(Saveable, Loader):
java_model.save(sc._jsc.sc(), path)
@classmethod
+ @since('1.4.0')
def load(cls, sc, path):
+ """
+ Load a model from the given path.
+ """
java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
sc._jsc.sc(), path)
# Can not unpickle array.array from Pyrolite in Python3 with "bytes"
@@ -567,8 +621,12 @@ class NaiveBayesModel(Saveable, Loader):
class NaiveBayes(object):
+ """
+ .. versionadded:: 0.9.0
+ """
@classmethod
+ @since('0.9.0')
def train(cls, data, lambda_=1.0):
"""
Train a Naive Bayes model given an RDD of (label, features)
@@ -605,6 +663,8 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
iteration.
:param regParam: L2 Regularization parameter.
:param convergenceTol: A condition which decides iteration termination.
+
+ .. versionadded:: 1.5.0
"""
def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.01,
convergenceTol=0.001):
@@ -617,6 +677,7 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
super(StreamingLogisticRegressionWithSGD, self).__init__(
model=self._model)
+ @since('1.5.0')
def setInitialWeights(self, initialWeights):
"""
Set the initial value of weights.
@@ -630,6 +691,7 @@ class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
initialWeights, 0, initialWeights.size, 2)
return self
+ @since('1.5.0')
def trainOn(self, dstream):
"""Train the model on the incoming dstream."""
self._validate(dstream)