diff options
-rw-r--r-- | docs/mllib-guide.md | 9 | ||||
-rw-r--r-- | python/pyspark/mllib/classification.py | 14 | ||||
-rw-r--r-- | python/pyspark/mllib/regression.py | 28 |
3 files changed, 30 insertions, 21 deletions
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index c977bc4f35..1a5c640d10 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -21,6 +21,8 @@ depends on native Fortran routines. You may need to install the if it is not already present on your nodes. MLlib will throw a linking error if it cannot detect these libraries automatically. +To use MLlib in Python, you will also need [NumPy](http://www.numpy.org) version 1.7 or newer. + # Binary Classification Binary classification is a supervised learning problem in which we want to @@ -316,6 +318,13 @@ other signals), you can use the trainImplicit method to get better results. val model = ALS.trainImplicit(ratings, 1, 20, 0.01) {% endhighlight %} +# Using MLLib in Java + +All of MLlib's methods use Java-friendly types, so you can import and call them there the same +way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the +Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by +calling `.rdd()` on your `JavaRDD` object. + # Using MLLib in Python Following examples can be tested in the PySpark shell. diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 03ff5a572e..19b90dfd6e 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -44,13 +44,13 @@ class LogisticRegressionModel(LinearModel): class LogisticRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, - mini_batch_fraction=1.0, initial_weights=None): + miniBatchFraction=1.0, initialWeights=None): """Train a logistic regression model on the given data.""" sc = data.context return _regression_train_wrapper(sc, lambda d, i: sc._jvm.PythonMLLibAPI().trainLogisticRegressionModelWithSGD(d._jrdd, - iterations, step, mini_batch_fraction, i), - LogisticRegressionModel, data, initial_weights) + iterations, step, miniBatchFraction, i), + LogisticRegressionModel, data, initialWeights) class SVMModel(LinearModel): """A support vector machine. @@ -67,14 +67,14 @@ class SVMModel(LinearModel): class SVMWithSGD(object): @classmethod - def train(cls, data, iterations=100, step=1.0, reg_param=1.0, - mini_batch_fraction=1.0, initial_weights=None): + def train(cls, data, iterations=100, step=1.0, regParam=1.0, + miniBatchFraction=1.0, initialWeights=None): """Train a support vector machine on the given data.""" sc = data.context return _regression_train_wrapper(sc, lambda d, i: sc._jvm.PythonMLLibAPI().trainSVMModelWithSGD(d._jrdd, - iterations, step, reg_param, mini_batch_fraction, i), - SVMModel, data, initial_weights) + iterations, step, regParam, miniBatchFraction, i), + SVMModel, data, initialWeights) class NaiveBayesModel(object): """ diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index e90b72893f..7656db07f6 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -47,57 +47,57 @@ class LinearRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit. >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2) - >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights=array([1.0])) + >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0])) """ class LinearRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, - mini_batch_fraction=1.0, initial_weights=None): + miniBatchFraction=1.0, initialWeights=None): """Train a linear regression model on the given data.""" sc = data.context return _regression_train_wrapper(sc, lambda d, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD( - d._jrdd, iterations, step, mini_batch_fraction, i), - LinearRegressionModel, data, initial_weights) + d._jrdd, iterations, step, miniBatchFraction, i), + LinearRegressionModel, data, initialWeights) class LassoModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an l_1 penalty term. >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2) - >>> lrm = LassoWithSGD.train(sc.parallelize(data), initial_weights=array([1.0])) + >>> lrm = LassoWithSGD.train(sc.parallelize(data), initialWeights=array([1.0])) """ class LassoWithSGD(object): @classmethod - def train(cls, data, iterations=100, step=1.0, reg_param=1.0, - mini_batch_fraction=1.0, initial_weights=None): + def train(cls, data, iterations=100, step=1.0, regParam=1.0, + miniBatchFraction=1.0, initialWeights=None): """Train a Lasso regression model on the given data.""" sc = data.context return _regression_train_wrapper(sc, lambda d, i: sc._jvm.PythonMLLibAPI().trainLassoModelWithSGD(d._jrdd, - iterations, step, reg_param, mini_batch_fraction, i), - LassoModel, data, initial_weights) + iterations, step, regParam, miniBatchFraction, i), + LassoModel, data, initialWeights) class RidgeRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an l_2 penalty term. >>> data = array([0.0, 0.0, 1.0, 1.0, 3.0, 2.0, 2.0, 3.0]).reshape(4,2) - >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initial_weights=array([1.0])) + >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), initialWeights=array([1.0])) """ class RidgeRegressionWithSGD(object): @classmethod - def train(cls, data, iterations=100, step=1.0, reg_param=1.0, - mini_batch_fraction=1.0, initial_weights=None): + def train(cls, data, iterations=100, step=1.0, regParam=1.0, + miniBatchFraction=1.0, initialWeights=None): """Train a ridge regression model on the given data.""" sc = data.context return _regression_train_wrapper(sc, lambda d, i: sc._jvm.PythonMLLibAPI().trainRidgeModelWithSGD(d._jrdd, - iterations, step, reg_param, mini_batch_fraction, i), - RidgeRegressionModel, data, initial_weights) + iterations, step, regParam, miniBatchFraction, i), + RidgeRegressionModel, data, initialWeights) def _test(): import doctest |