[SPARK-7916] [MLLIB] MLlib Python doc parity check for classification and regression

Check then make the MLlib Python classification and regression doc to be as complete as the Scala doc. Author: Yanbo Liang <ybliang8@gmail.com> Closes #6460 from yanboliang/spark-7916 and squashes the following commits: f8deda4 [Yanbo Liang] trigger jenkins 6dc4d99 [Yanbo Liang] address comments ce2a43e [Yanbo Liang] truncate too long line and remove extra sparse 3eaf6ad [Yanbo Liang] MLlib Python doc parity check for classification and regression
author: Yanbo Liang <ybliang8@gmail.com> 2015-06-16 14:30:30 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2015-06-16 14:30:30 -0700
commit: ca998757e8ff2bdca2c7e88055c389161521d604 (patch)
tree: 971436419afa30cd08f251dcec429f8396bd20c7 /python/pyspark
parent: cebf2411847706a98dc8df9c754ef53d6d12a87c (diff)
download: spark-ca998757e8ff2bdca2c7e88055c389161521d604.tar.gz
spark-ca998757e8ff2bdca2c7e88055c389161521d604.tar.bz2
spark-ca998757e8ff2bdca2c7e88055c389161521d604.zip
2 files changed, 247 insertions, 107 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index a70c664a71..42e41397bf 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -33,8 +33,8 @@ __all__ = ['LogisticRegressionModel', 'LogisticRegressionWithSGD', 'LogisticRegr
 
 class LinearClassificationModel(LinearModel):
     """
-    A private abstract class representing a multiclass classification model.
-    The categories are represented by int values: 0, 1, 2, etc.
+    A private abstract class representing a multiclass classification
+    model. The categories are represented by int values: 0, 1, 2, etc.
     """
     def __init__(self, weights, intercept):
         super(LinearClassificationModel, self).__init__(weights, intercept)
@@ -44,10 +44,11 @@ class LinearClassificationModel(LinearModel):
         """
         .. note:: Experimental
 
-        Sets the threshold that separates positive predictions from negative
-        predictions. An example with prediction score greater than or equal
-        to this threshold is identified as an positive, and negative otherwise.
-        It is used for binary classification only.
+        Sets the threshold that separates positive predictions from
+        negative predictions. An example with prediction score greater
+        than or equal to this threshold is identified as a positive,
+        and negative otherwise. It is used for binary classification
+        only.
         """
         self._threshold = value
 
@@ -56,8 +57,9 @@ class LinearClassificationModel(LinearModel):
         """
         .. note:: Experimental
 
-        Returns the threshold (if any) used for converting raw prediction scores
-        into 0/1 predictions. It is used for binary classification only.
+        Returns the threshold (if any) used for converting raw
+        prediction scores into 0/1 predictions. It is used for
+        binary classification only.
         """
         return self._threshold
 
@@ -65,22 +67,35 @@ class LinearClassificationModel(LinearModel):
         """
         .. note:: Experimental
 
-        Clears the threshold so that `predict` will output raw prediction scores.
-        It is used for binary classification only.
+        Clears the threshold so that `predict` will output raw
+        prediction scores. It is used for binary classification only.
         """
         self._threshold = None
 
     def predict(self, test):
         """
-        Predict values for a single data point or an RDD of points using
-        the model trained.
+        Predict values for a single data point or an RDD of points
+        using the model trained.
         """
         raise NotImplementedError
 
 
 class LogisticRegressionModel(LinearClassificationModel):
 
-    """A linear binary classification model derived from logistic regression.
+    """
+    Classification model trained using Multinomial/Binary Logistic
+    Regression.
+
+    :param weights: Weights computed for every feature.
+    :param intercept: Intercept computed for this model. (Only used
+            in Binary Logistic Regression. In Multinomial Logistic
+            Regression, the intercepts will not be a single value,
+            so the intercepts will be part of the weights.)
+    :param numFeatures: the dimension of the features.
+    :param numClasses: the number of possible outcomes for a k-class
+            classification problem in Multinomial Logistic Regression.
+            By default, it is binary logistic regression so numClasses
+            will be set to 2.
 
     >>> data = [
     ...     LabeledPoint(0.0, [0.0, 1.0]),
@@ -161,8 +176,8 @@ class LogisticRegressionModel(LinearClassificationModel):
 
     def predict(self, x):
         """
-        Predict values for a single data point or an RDD of points using
-        the model trained.
+        Predict values for a single data point or an RDD of points
+        using the model trained.
         """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
@@ -225,16 +240,19 @@ class LogisticRegressionWithSGD(object):
         """
         Train a logistic regression model on the given data.
 
-        :param data:              The training data, an RDD of LabeledPoint.
-        :param iterations:        The number of iterations (default: 100).
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
         :param step:              The step parameter used in SGD
                                   (default: 1.0).
-        :param miniBatchFraction: Fraction of data to be used for each SGD
-                                  iteration.
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
         :param initialWeights:    The initial weights (default: None).
-        :param regParam:          The regularizer parameter (default: 0.01).
-        :param regType:           The type of regularizer used for training
-                                  our model.
+        :param regParam:          The regularizer parameter
+                                  (default: 0.01).
+        :param regType:           The type of regularizer used for
+                                  training our model.
 
                                   :Allowed values:
                                      - "l1" for using L1 regularization
@@ -243,13 +261,14 @@ class LogisticRegressionWithSGD(object):
 
                                      (default: "l2")
 
-        :param intercept:         Boolean parameter which indicates the use
-                                  or not of the augmented representation for
-                                  training data (i.e. whether bias features
-                                  are activated or not).
-        :param validateData:      Boolean parameter which indicates if the
-                                  algorithm should validate data before training.
-                                  (default: True)
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
         """
         def train(rdd, i):
             return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations),
@@ -267,12 +286,15 @@ class LogisticRegressionWithLBFGS(object):
         """
         Train a logistic regression model on the given data.
 
-        :param data:           The training data, an RDD of LabeledPoint.
-        :param iterations:     The number of iterations (default: 100).
+        :param data:           The training data, an RDD of
+                               LabeledPoint.
+        :param iterations:     The number of iterations
+                               (default: 100).
         :param initialWeights: The initial weights (default: None).
-        :param regParam:       The regularizer parameter (default: 0.01).
-        :param regType:        The type of regularizer used for training
-                               our model.
+        :param regParam:       The regularizer parameter
+                               (default: 0.01).
+        :param regType:        The type of regularizer used for
+                               training our model.
 
                                :Allowed values:
                                  - "l1" for using L1 regularization
@@ -281,19 +303,21 @@ class LogisticRegressionWithLBFGS(object):
 
                                  (default: "l2")
 
-        :param intercept:      Boolean parameter which indicates the use
-                               or not of the augmented representation for
-                               training data (i.e. whether bias features
-                               are activated or not).
-        :param corrections:    The number of corrections used in the LBFGS
-                               update (default: 10).
-        :param tolerance:      The convergence tolerance of iterations for
-                               L-BFGS (default: 1e-4).
+        :param intercept:      Boolean parameter which indicates the
+                               use or not of the augmented representation
+                               for training data (i.e. whether bias
+                               features are activated or not,
+                               default: False).
+        :param corrections:    The number of corrections used in the
+                               LBFGS update (default: 10).
+        :param tolerance:      The convergence tolerance of iterations
+                               for L-BFGS (default: 1e-4).
         :param validateData:   Boolean parameter which indicates if the
-                               algorithm should validate data before training.
-                               (default: True)
-        :param numClasses:     The number of classes (i.e., outcomes) a label can take
-                               in Multinomial Logistic Regression (default: 2).
+                               algorithm should validate data before
+                               training. (default: True)
+        :param numClasses:     The number of classes (i.e., outcomes) a
+                               label can take in Multinomial Logistic
+                               Regression (default: 2).
 
         >>> data = [
         ...     LabeledPoint(0.0, [0.0, 1.0]),
@@ -323,7 +347,11 @@ class LogisticRegressionWithLBFGS(object):
 
 class SVMModel(LinearClassificationModel):
 
-    """A support vector machine.
+    """
+    Model for Support Vector Machines (SVMs).
+
+    :param weights: Weights computed for every feature.
+    :param intercept: Intercept computed for this model.
 
     >>> data = [
     ...     LabeledPoint(0.0, [0.0]),
@@ -370,8 +398,8 @@ class SVMModel(LinearClassificationModel):
 
     def predict(self, x):
         """
-        Predict values for a single data point or an RDD of points using
-        the model trained.
+        Predict values for a single data point or an RDD of points
+        using the model trained.
         """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
@@ -409,16 +437,19 @@ class SVMWithSGD(object):
         """
         Train a support vector machine on the given data.
 
-        :param data:              The training data, an RDD of LabeledPoint.
-        :param iterations:        The number of iterations (default: 100).
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
         :param step:              The step parameter used in SGD
                                   (default: 1.0).
-        :param regParam:          The regularizer parameter (default: 0.01).
-        :param miniBatchFraction: Fraction of data to be used for each SGD
-                                  iteration.
+        :param regParam:          The regularizer parameter
+                                  (default: 0.01).
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
         :param initialWeights:    The initial weights (default: None).
-        :param regType:           The type of regularizer used for training
-                                  our model.
+        :param regType:           The type of regularizer used for
+                                  training our model.
 
                                   :Allowed values:
                                      - "l1" for using L1 regularization
@@ -427,13 +458,14 @@ class SVMWithSGD(object):
 
                                      (default: "l2")
 
-        :param intercept:         Boolean parameter which indicates the use
-                                  or not of the augmented representation for
-                                  training data (i.e. whether bias features
-                                  are activated or not).
-        :param validateData:      Boolean parameter which indicates if the
-                                  algorithm should validate data before training.
-                                  (default: True)
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
         """
         def train(rdd, i):
             return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step),
@@ -449,9 +481,11 @@ class NaiveBayesModel(Saveable, Loader):
     """
     Model for Naive Bayes classifiers.
 
-    Contains two parameters:
-    - pi: vector of logs of class priors (dimension C)
-    - theta: matrix of logs of class conditional probabilities (CxD)
+    :param labels: list of labels.
+    :param pi: log of class priors, whose dimension is C,
+            number of labels.
+    :param theta: log of class conditional probabilities, whose
+            dimension is C-by-D, where D is number of features.
 
     >>> data = [
     ...     LabeledPoint(0.0, [0.0, 0.0]),
@@ -493,7 +527,10 @@ class NaiveBayesModel(Saveable, Loader):
         self.theta = theta
 
     def predict(self, x):
-        """Return the most likely class for a data vector or an RDD of vectors"""
+        """
+        Return the most likely class for a data vector
+        or an RDD of vectors.
+        """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
         x = _convert_to_vector(x)
@@ -523,16 +560,18 @@ class NaiveBayes(object):
     @classmethod
     def train(cls, data, lambda_=1.0):
         """
-        Train a Naive Bayes model given an RDD of (label, features) vectors.
+        Train a Naive Bayes model given an RDD of (label, features)
+        vectors.
 
-        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
-        handle all kinds of discrete data.  For example, by converting
-        documents into TF-IDF vectors, it can be used for document
-        classification.  By making every vector a 0-1 vector, it can also be
-        used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
+        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which
+        can handle all kinds of discrete data.  For example, by
+        converting documents into TF-IDF vectors, it can be used for
+        document classification. By making every vector a 0-1 vector,
+        it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
+        The input feature values must be nonnegative.
 
         :param data: RDD of LabeledPoint.
-        :param lambda_: The smoothing parameter
+        :param lambda_: The smoothing parameter (default: 1.0).
         """
         first = data.first()
         if not isinstance(first, LabeledPoint):
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 41bde2ce3e..0c4d7d3bbe 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -33,12 +33,12 @@ __all__ = ['LabeledPoint', 'LinearModel',
 class LabeledPoint(object):
 
     """
-    The features and labels of a data point.
+    Class that represents the features and labels of a data point.
 
     :param label: Label for this data point.
     :param features: Vector of features for this point (NumPy array,
-             list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
-             column matrix)
+            list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
+            column matrix)
 
     Note: 'label' and 'features' are accessible as class attributes.
     """
@@ -59,7 +59,12 @@ class LabeledPoint(object):
 
 class LinearModel(object):
 
-    """A linear model that has a vector of coefficients and an intercept."""
+    """
+    A linear model that has a vector of coefficients and an intercept.
+
+    :param weights: Weights computed for every feature.
+    :param intercept: Intercept computed for this model.
+    """
 
     def __init__(self, weights, intercept):
         self._coeff = _convert_to_vector(weights)
@@ -193,18 +198,28 @@ class LinearRegressionWithSGD(object):
               initialWeights=None, regParam=0.0, regType=None, intercept=False,
               validateData=True):
         """
-        Train a linear regression model on the given data.
-
-        :param data:              The training data.
-        :param iterations:        The number of iterations (default: 100).
+        Train a linear regression model using Stochastic Gradient
+        Descent (SGD).
+        This solves the least squares regression formulation
+                f(weights) = 1/n ||A weights - y||^2
+        (which is the mean squared error).
+        Here the data matrix has n rows, and the input RDD holds the
+        set of rows of A, each with its corresponding right hand side
+        label y. See also the documentation for the precise formulation.
+
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
         :param step:              The step parameter used in SGD
                                   (default: 1.0).
-        :param miniBatchFraction: Fraction of data to be used for each SGD
-                                  iteration.
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
         :param initialWeights:    The initial weights (default: None).
-        :param regParam:          The regularizer parameter (default: 0.0).
-        :param regType:           The type of regularizer used for training
-                                  our model.
+        :param regParam:          The regularizer parameter
+                                  (default: 0.0).
+        :param regType:           The type of regularizer used for
+                                  training our model.
 
                                   :Allowed values:
                                      - "l1" for using L1 regularization (lasso),
@@ -213,13 +228,14 @@ class LinearRegressionWithSGD(object):
 
                                      (default: None)
 
-        :param intercept:         Boolean parameter which indicates the use
-                                  or not of the augmented representation for
-                                  training data (i.e. whether bias features
-                                  are activated or not). (default: False)
-        :param validateData:      Boolean parameter which indicates if the
-                                  algorithm should validate data before training.
-                                  (default: True)
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
         """
         def train(rdd, i):
             return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
@@ -232,8 +248,8 @@ class LinearRegressionWithSGD(object):
 @inherit_doc
 class LassoModel(LinearRegressionModelBase):
 
-    """A linear regression model derived from a least-squares fit with an
-    l_1 penalty term.
+    """A linear regression model derived from a least-squares fit with
+    an l_1 penalty term.
 
     >>> from pyspark.mllib.regression import LabeledPoint
     >>> data = [
@@ -304,7 +320,36 @@ class LassoWithSGD(object):
     def train(cls, data, iterations=100, step=1.0, regParam=0.01,
               miniBatchFraction=1.0, initialWeights=None, intercept=False,
               validateData=True):
-        """Train a Lasso regression model on the given data."""
+        """
+        Train a regression model with L1-regularization using
+        Stochastic Gradient Descent.
+        This solves the l1-regularized least squares regression
+        formulation
+            f(weights) = 1/(2n) ||A weights - y||^2 + regParam ||weights||_1
+        Here the data matrix has n rows, and the input RDD holds the
+        set of rows of A, each with its corresponding right hand side
+        label y. See also the documentation for the precise formulation.
+
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
+        :param step:              The step parameter used in SGD
+                                  (default: 1.0).
+        :param regParam:          The regularizer parameter
+                                  (default: 0.01).
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
+        :param initialWeights:    The initial weights (default: None).
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
+        """
         def train(rdd, i):
             return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
                                  float(regParam), float(miniBatchFraction), i, bool(intercept),
@@ -316,8 +361,8 @@ class LassoWithSGD(object):
 @inherit_doc
 class RidgeRegressionModel(LinearRegressionModelBase):
 
-    """A linear regression model derived from a least-squares fit with an
-    l_2 penalty term.
+    """A linear regression model derived from a least-squares fit with
+    an l_2 penalty term.
 
     >>> from pyspark.mllib.regression import LabeledPoint
     >>> data = [
@@ -389,7 +434,36 @@ class RidgeRegressionWithSGD(object):
     def train(cls, data, iterations=100, step=1.0, regParam=0.01,
               miniBatchFraction=1.0, initialWeights=None, intercept=False,
               validateData=True):
-        """Train a ridge regression model on the given data."""
+        """
+        Train a regression model with L2-regularization using
+        Stochastic Gradient Descent.
+        This solves the l2-regularized least squares regression
+        formulation
+            f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2
+        Here the data matrix has n rows, and the input RDD holds the
+        set of rows of A, each with its corresponding right hand side
+        label y. See also the documentation for the precise formulation.
+
+        :param data:              The training data, an RDD of
+                                  LabeledPoint.
+        :param iterations:        The number of iterations
+                                  (default: 100).
+        :param step:              The step parameter used in SGD
+                                  (default: 1.0).
+        :param regParam:          The regularizer parameter
+                                  (default: 0.01).
+        :param miniBatchFraction: Fraction of data to be used for each
+                                  SGD iteration (default: 1.0).
+        :param initialWeights:    The initial weights (default: None).
+        :param intercept:         Boolean parameter which indicates the
+                                  use or not of the augmented representation
+                                  for training data (i.e. whether bias
+                                  features are activated or not,
+                                  default: False).
+        :param validateData:      Boolean parameter which indicates if
+                                  the algorithm should validate data
+                                  before training. (default: True)
+        """
         def train(rdd, i):
             return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
                                  float(regParam), float(miniBatchFraction), i, bool(intercept),
@@ -400,7 +474,15 @@ class RidgeRegressionWithSGD(object):
 
 class IsotonicRegressionModel(Saveable, Loader):
 
-    """Regression model for isotonic regression.
+    """
+    Regression model for isotonic regression.
+
+    :param boundaries: Array of boundaries for which predictions are
+            known. Boundaries must be sorted in increasing order.
+    :param predictions: Array of predictions associated to the
+            boundaries at the same index. Results of isotonic
+            regression and therefore monotone.
+    :param isotonic: indicates whether this is isotonic or antitonic.
 
     >>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
     >>> irm = IsotonicRegression.train(sc.parallelize(data))
@@ -430,6 +512,25 @@ class IsotonicRegressionModel(Saveable, Loader):
         self.isotonic = isotonic
 
     def predict(self, x):
+        """
+        Predict labels for provided features
+        using a piecewise linear function.
+        1) If x exactly matches a boundary then associated prediction
+        is returned. In case there are multiple predictions with the
+        same boundary then one of them is returned. Which one is
+        undefined (same as java.util.Arrays.binarySearch).
+        2) If x is lower or higher than all boundaries then first or
+        last prediction is returned respectively. In case there are
+        multiple predictions with the same boundary then the lowest
+        or highest is returned respectively.
+        3) If x falls between two values in boundary array then
+        prediction is treated as piecewise linear function and
+        interpolated value is returned. In case there are multiple
+        values with the same boundary then the same rules as in 2)
+        are used.
+
+        :param x: Feature or RDD of Features to be labeled.
+        """
         if isinstance(x, RDD):
             return x.map(lambda v: self.predict(v))
         return np.interp(x, self.boundaries, self.predictions)
@@ -451,15 +552,15 @@ class IsotonicRegressionModel(Saveable, Loader):
 
 
 class IsotonicRegression(object):
-    """
-    Run IsotonicRegression algorithm to obtain isotonic regression model.
 
-    :param data:            RDD of (label, feature, weight) tuples.
-    :param isotonic:        Whether this is isotonic or antitonic.
-    """
     @classmethod
     def train(cls, data, isotonic=True):
-        """Train a isotonic regression model on the given data."""
+        """
+        Train an isotonic regression model on the given data.
+
+        :param data: RDD of (label, feature, weight) tuples.
+        :param isotonic: Whether this is isotonic or antitonic.
+        """
         boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
                                                 data.map(_convert_to_vector), bool(isotonic))
         return IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic)
author	Yanbo Liang <ybliang8@gmail.com>	2015-06-16 14:30:30 -0700
committer	Joseph K. Bradley <joseph@databricks.com>	2015-06-16 14:30:30 -0700
commit	ca998757e8ff2bdca2c7e88055c389161521d604 (patch)
tree	971436419afa30cd08f251dcec429f8396bd20c7 /python/pyspark
parent	cebf2411847706a98dc8df9c754ef53d6d12a87c (diff)
download	spark-ca998757e8ff2bdca2c7e88055c389161521d604.tar.gz spark-ca998757e8ff2bdca2c7e88055c389161521d604.tar.bz2 spark-ca998757e8ff2bdca2c7e88055c389161521d604.zip