aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
author: Bryan Cutler <bjcutler@us.ibm.com> 2015-11-23 17:11:51 -0800
committer: Joseph K. Bradley <joseph@databricks.com> 2015-11-23 17:11:51 -0800
commit: 105745645b12afbbc2a350518cb5853a88944183 (patch)
tree: 258d010b9eb9702a4e809ef84680e4adbfa390fd /python
parent: 9db5f601facfdaba6e4333a6b2d2e4a9f009c788 (diff)
downloadspark-105745645b12afbbc2a350518cb5853a88944183.tar.gz
spark-105745645b12afbbc2a350518cb5853a88944183.tar.bz2
spark-105745645b12afbbc2a350518cb5853a88944183.zip
[SPARK-10560][PYSPARK][MLLIB][DOCS] Make StreamingLogisticRegressionWithSGD Python API equal to Scala one
This is to bring the API documentation of StreamingLogisticRegressionWithSGD and StreamingLinearRegressionWithSGD in line with the Scala versions.
- Fixed the algorithm descriptions
- Added default values to parameter descriptions
- Changed StreamingLogisticRegressionWithSGD regParam to default to 0, as in the Scala version

Author: Bryan Cutler <bjcutler@us.ibm.com>

Closes #9141 from BryanCutler/StreamingLogisticRegressionWithSGD-python-api-sync.
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/mllib/classification.py 37
-rw-r--r-- python/pyspark/mllib/regression.py 32
2 files changed, 46 insertions, 23 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index aab4015ba8..9e6f17ef6e 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -652,21 +652,34 @@ class NaiveBayes(object):
@inherit_doc
class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
"""
- Run LogisticRegression with SGD on a batch of data.
-
- The weights obtained at the end of training a stream are used as initial
- weights for the next batch.
-
- :param stepSize: Step size for each iteration of gradient descent.
- :param numIterations: Number of iterations run for each batch of data.
- :param miniBatchFraction: Fraction of data on which SGD is run for each
- iteration.
- :param regParam: L2 Regularization parameter.
- :param convergenceTol: A condition which decides iteration termination.
+ Train or predict a logistic regression model on streaming data. Training uses
+ Stochastic Gradient Descent to update the model based on each new batch of
+ incoming data from a DStream.
+
+ Each batch of data is assumed to be an RDD of LabeledPoints.
+ The number of data points per batch can vary, but the number
+ of features must be constant. An initial weight
+ vector must be provided.
+
+ :param stepSize:
+ Step size for each iteration of gradient descent.
+ (default: 0.1)
+ :param numIterations:
+ Number of iterations run for each batch of data.
+ (default: 50)
+ :param miniBatchFraction:
+ Fraction of each batch of data to use for updates.
+ (default: 1.0)
+ :param regParam:
+ L2 Regularization parameter.
+ (default: 0.0)
+ :param convergenceTol:
+ Value used to determine when to terminate iterations.
+ (default: 0.001)
.. versionadded:: 1.5.0
"""
- def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.01,
+ def __init__(self, stepSize=0.1, numIterations=50, miniBatchFraction=1.0, regParam=0.0,
convergenceTol=0.001):
self.stepSize = stepSize
self.numIterations = numIterations
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 6f00d1df20..13b3397501 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -734,17 +734,27 @@ class StreamingLinearAlgorithm(object):
@inherit_doc
class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm):
"""
- Run LinearRegression with SGD on a batch of data.
-
- The problem minimized is (1 / n_samples) * (y - weights'X)**2.
- After training on a batch of data, the weights obtained at the end of
- training are used as initial weights for the next batch.
-
- :param stepSize: Step size for each iteration of gradient descent.
- :param numIterations: Total number of iterations run.
- :param miniBatchFraction: Fraction of data on which SGD is run for each
- iteration.
- :param convergenceTol: A condition which decides iteration termination.
+ Train or predict a linear regression model on streaming data. Training uses
+ Stochastic Gradient Descent to update the model based on each new batch of
+ incoming data from a DStream (see `LinearRegressionWithSGD` for model equation).
+
+ Each batch of data is assumed to be an RDD of LabeledPoints.
+ The number of data points per batch can vary, but the number
+ of features must be constant. An initial weight
+ vector must be provided.
+
+ :param stepSize:
+ Step size for each iteration of gradient descent.
+ (default: 0.1)
+ :param numIterations:
+ Number of iterations run for each batch of data.
+ (default: 50)
+ :param miniBatchFraction:
+ Fraction of each batch of data to use for updates.
+ (default: 1.0)
+ :param convergenceTol:
+ Value used to determine when to terminate iterations.
+ (default: 0.001)
.. versionadded:: 1.5.0
"""