path: root/python/pyspark/mllib
author     Joseph K. Bradley <joseph@databricks.com>  2015-02-20 02:31:32 -0800
committer  Xiangrui Meng <meng@databricks.com>        2015-02-20 02:31:32 -0800
commit     4a17eedb16343413e5b6f8bb58c6da8952ee7ab6 (patch)
tree       ac17fd9eed4f42ba3095b148c68d4e78b6afc875 /python/pyspark/mllib
parent     d3dfebebce9f76e4433e16d4d6d29fb8fa4d4193 (diff)
[SPARK-5867] [SPARK-5892] [doc] [ml] [mllib] Doc cleanups for 1.3 release
For SPARK-5867:
* The spark.ml programming guide needs to be updated to use the new SQL DataFrame API instead of the old SchemaRDD API.
* It should also include Python examples now.

For SPARK-5892:
* Fix Python docs
* Various other cleanups

BTW, I accidentally merged this with master. If you want to compile it on your own, use this branch, which is based on spark/branch-1.3 and cherry-picks the commits from this PR: https://github.com/jkbradley/spark/tree/doc-review-1.3-check

CC: mengxr (ML), davies (Python docs)

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #4675 from jkbradley/doc-review-1.3 and squashes the following commits:

f191bb0 [Joseph K. Bradley] small cleanups
e786efa [Joseph K. Bradley] small doc corrections
6b1ab4a [Joseph K. Bradley] fixed python lint test
946affa [Joseph K. Bradley] Added sample data for ml.MovieLensALS example. Changed spark.ml Java examples to use DataFrames API instead of sql()
da81558 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into doc-review-1.3
629dbf5 [Joseph K. Bradley] Updated based on code review: * made new page for old migration guides * small fixes * moved inherit_doc in python
b9df7c4 [Joseph K. Bradley] Small cleanups: toDF to toDF(), adding s for string interpolation
34b067f [Joseph K. Bradley] small doc correction
da16aef [Joseph K. Bradley] Fixed python mllib docs
8cce91c [Joseph K. Bradley] GMM: removed old imports, added some doc
695f3f6 [Joseph K. Bradley] partly done trying to fix inherit_doc for class hierarchies in python docs
a72c018 [Joseph K. Bradley] made ChiSqTestResult appear in python docs
b05a80d [Joseph K. Bradley] organize imports. doc cleanups
e572827 [Joseph K. Bradley] updated programming guide for ml and mllib
Diffstat (limited to 'python/pyspark/mllib')
-rw-r--r--  python/pyspark/mllib/__init__.py            |  2
-rw-r--r--  python/pyspark/mllib/clustering.py          |  2
-rw-r--r--  python/pyspark/mllib/common.py              | 17
-rw-r--r--  python/pyspark/mllib/regression.py          |  8
-rw-r--r--  python/pyspark/mllib/stat/__init__.py       |  4
-rw-r--r--  python/pyspark/mllib/stat/distribution.py   |  3
-rw-r--r--  python/pyspark/mllib/tree.py                |  6
7 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py
index c3217620e3..6449800d9c 100644
--- a/python/pyspark/mllib/__init__.py
+++ b/python/pyspark/mllib/__init__.py
@@ -19,7 +19,7 @@
Python bindings for MLlib.
"""
-# MLlib currently needs and NumPy 1.4+, so complain if lower
+# MLlib currently needs NumPy 1.4+, so complain if lower
import numpy
if numpy.version.version < '1.4':
diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py
index f6b97abb17..949db5705a 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -152,7 +152,7 @@ class GaussianMixtureModel(object):
class GaussianMixture(object):
"""
- Estimate model parameters with the expectation-maximization algorithm.
+ Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm.
:param data: RDD of data points
:param k: Number of components
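For context, a minimal usage sketch of the GaussianMixture API documented above. This assumes an existing SparkContext named sc; the data points and parameter values are illustrative only.

    from numpy import array
    from pyspark.mllib.clustering import GaussianMixture

    # 'sc' is assumed to be an existing SparkContext; the points below are
    # illustrative: two loose clusters near (0, 0) and (0.9, 0.9)
    data = sc.parallelize([array([-0.1, -0.05]), array([-0.01, -0.1]),
                           array([0.9, 0.8]), array([0.75, 0.935])])
    model = GaussianMixture.train(data, k=2, convergenceTol=1e-3, maxIterations=100)
    print(model.weights)                   # mixing weights, one per component
    print(model.predict(data).collect())   # hard cluster assignment per point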
diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py
index 3c5ee66cd8..621591c26b 100644
--- a/python/pyspark/mllib/common.py
+++ b/python/pyspark/mllib/common.py
@@ -134,3 +134,20 @@ class JavaModelWrapper(object):
def call(self, name, *a):
"""Call method of java_model"""
return callJavaFunc(self._sc, getattr(self._java_model, name), *a)
+
+
+def inherit_doc(cls):
+ """
+ A decorator that makes a class inherit documentation from its parents.
+ """
+ for name, func in vars(cls).items():
+ # only inherit docstring for public functions
+ if name.startswith("_"):
+ continue
+ if not func.__doc__:
+ for parent in cls.__bases__:
+ parent_func = getattr(parent, name, None)
+ if parent_func and getattr(parent_func, "__doc__", None):
+ func.__doc__ = parent_func.__doc__
+ break
+ return cls
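To illustrate what the decorator does, here is a small sketch with hypothetical classes (not from the Spark codebase): a subclass method that lacks a docstring picks up its parent's docstring when the class is decorated.

    class Estimator(object):
        def fit(self):
            """Fit the model to the data."""
            pass

    @inherit_doc
    class LinearEstimator(Estimator):   # hypothetical subclass
        def fit(self):                  # no docstring of its own
            pass

    print(LinearEstimator.fit.__doc__)  # prints "Fit the model to the data."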
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 210060140f..21751cc68f 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -18,7 +18,7 @@
import numpy as np
from numpy import array
-from pyspark.mllib.common import callMLlibFunc
+from pyspark.mllib.common import callMLlibFunc, inherit_doc
from pyspark.mllib.linalg import SparseVector, _convert_to_vector
__all__ = ['LabeledPoint', 'LinearModel', 'LinearRegressionModel', 'RidgeRegressionModel',
@@ -69,6 +69,7 @@ class LinearModel(object):
return "(weights=%s, intercept=%r)" % (self._coeff, self._intercept)
+@inherit_doc
class LinearRegressionModelBase(LinearModel):
"""A linear regression model.
@@ -89,6 +90,7 @@ class LinearRegressionModelBase(LinearModel):
return self.weights.dot(x) + self.intercept
+@inherit_doc
class LinearRegressionModel(LinearRegressionModelBase):
"""A linear regression model derived from a least-squares fit.
@@ -162,7 +164,7 @@ class LinearRegressionWithSGD(object):
@param intercept: Boolean parameter which indicates the use
or not of the augmented representation for
training data (i.e. whether bias features
- are activated or not).
+ are activated or not). (default: False)
"""
def train(rdd, i):
return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
@@ -172,6 +174,7 @@ class LinearRegressionWithSGD(object):
return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights)
+@inherit_doc
class LassoModel(LinearRegressionModelBase):
"""A linear regression model derived from a least-squares fit with an
@@ -218,6 +221,7 @@ class LassoWithSGD(object):
return _regression_train_wrapper(train, LassoModel, data, initialWeights)
+@inherit_doc
class RidgeRegressionModel(LinearRegressionModelBase):
"""A linear regression model derived from a least-squares fit with an
diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py
index b686d955a0..e3e128513e 100644
--- a/python/pyspark/mllib/stat/__init__.py
+++ b/python/pyspark/mllib/stat/__init__.py
@@ -21,5 +21,7 @@ Python package for statistical functions in MLlib.
from pyspark.mllib.stat._statistics import *
from pyspark.mllib.stat.distribution import MultivariateGaussian
+from pyspark.mllib.stat.test import ChiSqTestResult
-__all__ = ["Statistics", "MultivariateStatisticalSummary", "MultivariateGaussian"]
+__all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult",
+ "MultivariateGaussian"]
diff --git a/python/pyspark/mllib/stat/distribution.py b/python/pyspark/mllib/stat/distribution.py
index 07792e1532..46f7a1d2f2 100644
--- a/python/pyspark/mllib/stat/distribution.py
+++ b/python/pyspark/mllib/stat/distribution.py
@@ -22,7 +22,8 @@ __all__ = ['MultivariateGaussian']
class MultivariateGaussian(namedtuple('MultivariateGaussian', ['mu', 'sigma'])):
- """ Represents a (mu, sigma) tuple
+ """Represents a (mu, sigma) tuple
+
>>> m = MultivariateGaussian(Vectors.dense([11,12]),DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0)))
>>> (m.mu, m.sigma.toArray())
(DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]]))
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index aae48f2132..02d551b87d 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -20,12 +20,12 @@ from __future__ import absolute_import
import random
from pyspark import SparkContext, RDD
-from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
+from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel',
- 'RandomForest', 'GradientBoostedTrees']
+ 'RandomForest', 'GradientBoostedTreesModel', 'GradientBoostedTrees']
class TreeEnsembleModel(JavaModelWrapper):
@@ -216,6 +216,7 @@ class DecisionTree(object):
impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+@inherit_doc
class RandomForestModel(TreeEnsembleModel):
"""
.. note:: Experimental
@@ -381,6 +382,7 @@ class RandomForest(object):
featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
+@inherit_doc
class GradientBoostedTreesModel(TreeEnsembleModel):
"""
.. note:: Experimental
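Since GradientBoostedTreesModel is now part of the public API listed in __all__, a minimal end-to-end sketch of training a model of that type (assuming an existing SparkContext sc; data and parameters are illustrative):

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.tree import GradientBoostedTrees

    # 'sc' is assumed to be an existing SparkContext; toy binary labels
    data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
                           LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])])
    model = GradientBoostedTrees.trainClassifier(data, categoricalFeaturesInfo={},
                                                 numIterations=10)
    print(model.numTrees(), model.totalNumNodes())
    print(model.predict([2.5]))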