aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/mllib-guide.md9
-rw-r--r--python/pyspark/mllib/classification.py14
-rw-r--r--python/pyspark/mllib/regression.py28
3 files changed, 30 insertions, 21 deletions
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index c977bc4f35..1a5c640d10 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -21,6 +21,8 @@ depends on native Fortran routines. You may need to install the
if it is not already present on your nodes. MLlib will throw a linking error if it cannot
detect these libraries automatically.
+To use MLlib in Python, you will also need [NumPy](http://www.numpy.org) version 1.7 or newer.
+
# Binary Classification
Binary classification is a supervised learning problem in which we want to
@@ -316,6 +318,13 @@ other signals), you can use the trainImplicit method to get better results.
val model = ALS.trainImplicit(ratings, 1, 20, 0.01)
{% endhighlight %}
+# Using MLLib in Java
+
+All of MLlib's methods use Java-friendly types, so you can import and call them there the same
+way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
+Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
+calling `.rdd()` on your `JavaRDD` object.
+
# Using MLLib in Python
Following examples can be tested in the PySpark shell.
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 03ff5a572e..19b90dfd6e 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -44,13 +44,13 @@ class LogisticRegressionModel(LinearModel):
class LogisticRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0,
- mini_batch_fraction=1.0, initial_weights=None):
+ miniBatchFraction=1.0, initialWeights=None):
"""Train a logistic regression model on the given data."""
sc = data.context
return _regression_train_wrapper(sc, lambda d, i:
sc._jvm.PythonMLLibAPI().trainLogisticRegressionModelWithSGD(d._jrdd,
- iterations, step, mini_batch_fraction, i),
- LogisticRegressionModel, data, initial_weights)
+ iterations, step, miniBatchFraction, i),
+ LogisticRegressionModel, data, initialWeights)
class SVMModel(LinearModel):
"""A support vector machine.
@@ -67,14 +67,14 @@ class SVMModel(LinearModel):
class SVMWithSGD(object):
@classmethod
- def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
- mini_batch_fraction=1.0, initial_weights=None):
+ def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+ miniBatchFraction=1.0, initialWeights=None):
"""Train a support vector machine on the given data."""
sc = data.context
return _regression_train_wrapper(sc, lambda d, i:
sc._jvm.PythonMLLibAPI().trainSVMModelWithSGD(d._jrdd,
- iterations, step, reg_param, mini_batch_fraction, i),
- SVMModel, data, initial_weights)
+ iterations, step, regParam, miniBatchFraction, i),
+ SVMModel, data, initialWeights)
class NaiveBayesModel(object):
"""
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index e90b72893f..7656db07f6 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -47,57 +47,57 @@ class LinearRegressionModel(LinearRegressionModelBase):
"""A linear regression model derived from a least-squares fit.
>>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
- >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
+ >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
"""
class LinearRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0,
- mini_batch_fraction=1.0, initial_weights=None):
+ miniBatchFraction=1.0, initialWeights=None):
"""Train a linear regression model on the given data."""
sc = data.context
return _regression_train_wrapper(sc, lambda d, i:
sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD(
- d._jrdd, iterations, step, mini_batch_fraction, i),
- LinearRegressionModel, data, initial_weights)
+ d._jrdd, iterations, step, miniBatchFraction, i),
+ LinearRegressionModel, data, initialWeights)
class LassoModel(LinearRegressionModelBase):
"""A linear regression model derived from a least-squares fit with an
l_1 penalty term.
>>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
- >>> lrm = LassoWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
+ >>> lrm = LassoWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
"""
class LassoWithSGD(object):
@classmethod
- def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
- mini_batch_fraction=1.0, initial_weights=None):
+ def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+ miniBatchFraction=1.0, initialWeights=None):
"""Train a Lasso regression model on the given data."""
sc = data.context
return _regression_train_wrapper(sc, lambda d, i:
sc._jvm.PythonMLLibAPI().trainLassoModelWithSGD(d._jrdd,
- iterations, step, reg_param, mini_batch_fraction, i),
- LassoModel, data, initial_weights)
+ iterations, step, regParam, miniBatchFraction, i),
+ LassoModel, data, initialWeights)
class RidgeRegressionModel(LinearRegressionModelBase):
"""A linear regression model derived from a least-squares fit with an
l_2 penalty term.
>>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2)
- >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initial_weights=array([1.0]))
+ >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0]))
"""
class RidgeRegressionWithSGD(object):
@classmethod
- def train(cls, data, iterations=100, step=1.0, reg_param=1.0,
- mini_batch_fraction=1.0, initial_weights=None):
+ def train(cls, data, iterations=100, step=1.0, regParam=1.0,
+ miniBatchFraction=1.0, initialWeights=None):
"""Train a ridge regression model on the given data."""
sc = data.context
return _regression_train_wrapper(sc, lambda d, i:
sc._jvm.PythonMLLibAPI().trainRidgeModelWithSGD(d._jrdd,
- iterations, step, reg_param, mini_batch_fraction, i),
- RidgeRegressionModel, data, initial_weights)
+ iterations, step, regParam, miniBatchFraction, i),
+ RidgeRegressionModel, data, initialWeights)
def _test():
import doctest