diff options
author | Joseph K. Bradley <joseph@databricks.com> | 2015-02-20 02:31:32 -0800 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-02-20 02:31:39 -0800 |
commit | 8c12f311444008fedc610d866f2535233027bced (patch) | |
tree | 6297ca100a9e2d3203a8d236ed2a0caf2a3e3812 /python | |
parent | 0382dcc0a94f8e619fd11ec2cc0b18459a690c2b (diff) | |
download | spark-8c12f311444008fedc610d866f2535233027bced.tar.gz spark-8c12f311444008fedc610d866f2535233027bced.tar.bz2 spark-8c12f311444008fedc610d866f2535233027bced.zip |
[SPARK-5867] [SPARK-5892] [doc] [ml] [mllib] Doc cleanups for 1.3 release
For SPARK-5867:
* The spark.ml programming guide needs to be updated to use the new SQL DataFrame API instead of the old SchemaRDD API.
* It should also include Python examples now.
For SPARK-5892:
* Fix Python docs
* Various other cleanups
BTW, I accidentally merged this with master. If you want to compile it on your own, use this branch, which is based on spark/branch-1.3 and cherry-picks the commits from this PR: [https://github.com/jkbradley/spark/tree/doc-review-1.3-check]
CC: mengxr (ML), davies (Python docs)
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #4675 from jkbradley/doc-review-1.3 and squashes the following commits:
f191bb0 [Joseph K. Bradley] small cleanups
e786efa [Joseph K. Bradley] small doc corrections
6b1ab4a [Joseph K. Bradley] fixed python lint test
946affa [Joseph K. Bradley] Added sample data for ml.MovieLensALS example. Changed spark.ml Java examples to use DataFrames API instead of sql()
da81558 [Joseph K. Bradley] Merge remote-tracking branch 'upstream/master' into doc-review-1.3
629dbf5 [Joseph K. Bradley] Updated based on code review: * made new page for old migration guides * small fixes * moved inherit_doc in python
b9df7c4 [Joseph K. Bradley] Small cleanups: toDF to toDF(), adding s for string interpolation
34b067f [Joseph K. Bradley] small doc correction
da16aef [Joseph K. Bradley] Fixed python mllib docs
8cce91c [Joseph K. Bradley] GMM: removed old imports, added some doc
695f3f6 [Joseph K. Bradley] partly done trying to fix inherit_doc for class hierarchies in python docs
a72c018 [Joseph K. Bradley] made ChiSqTestResult appear in python docs
b05a80d [Joseph K. Bradley] organize imports. doc cleanups
e572827 [Joseph K. Bradley] updated programming guide for ml and mllib
(cherry picked from commit 4a17eedb16343413e5b6f8bb58c6da8952ee7ab6)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
Diffstat (limited to 'python')
-rw-r--r-- | python/docs/pyspark.mllib.rst | 11 | ||||
-rw-r--r-- | python/pyspark/ml/classification.py | 3 | ||||
-rw-r--r-- | python/pyspark/ml/feature.py | 3 | ||||
-rw-r--r-- | python/pyspark/ml/pipeline.py | 3 | ||||
-rw-r--r-- | python/pyspark/ml/util.py | 14 | ||||
-rw-r--r-- | python/pyspark/ml/wrapper.py | 2 | ||||
-rw-r--r-- | python/pyspark/mllib/__init__.py | 2 | ||||
-rw-r--r-- | python/pyspark/mllib/clustering.py | 2 | ||||
-rw-r--r-- | python/pyspark/mllib/common.py | 17 | ||||
-rw-r--r-- | python/pyspark/mllib/regression.py | 8 | ||||
-rw-r--r-- | python/pyspark/mllib/stat/__init__.py | 4 | ||||
-rw-r--r-- | python/pyspark/mllib/stat/distribution.py | 3 | ||||
-rw-r--r-- | python/pyspark/mllib/tree.py | 6 |
13 files changed, 44 insertions, 34 deletions
diff --git a/python/docs/pyspark.mllib.rst b/python/docs/pyspark.mllib.rst index 21f66ca344..b706c5e376 100644 --- a/python/docs/pyspark.mllib.rst +++ b/python/docs/pyspark.mllib.rst @@ -7,7 +7,7 @@ pyspark.mllib.classification module .. automodule:: pyspark.mllib.classification :members: :undoc-members: - :show-inheritance: + :inherited-members: pyspark.mllib.clustering module ------------------------------- @@ -15,7 +15,6 @@ pyspark.mllib.clustering module .. automodule:: pyspark.mllib.clustering :members: :undoc-members: - :show-inheritance: pyspark.mllib.feature module ------------------------------- @@ -39,7 +38,6 @@ pyspark.mllib.random module .. automodule:: pyspark.mllib.random :members: :undoc-members: - :show-inheritance: pyspark.mllib.recommendation module ----------------------------------- @@ -47,7 +45,6 @@ pyspark.mllib.recommendation module .. automodule:: pyspark.mllib.recommendation :members: :undoc-members: - :show-inheritance: pyspark.mllib.regression module ------------------------------- @@ -55,7 +52,7 @@ pyspark.mllib.regression module .. automodule:: pyspark.mllib.regression :members: :undoc-members: - :show-inheritance: + :inherited-members: pyspark.mllib.stat module ------------------------- @@ -63,7 +60,6 @@ pyspark.mllib.stat module .. automodule:: pyspark.mllib.stat :members: :undoc-members: - :show-inheritance: pyspark.mllib.tree module ------------------------- @@ -71,7 +67,7 @@ pyspark.mllib.tree module .. automodule:: pyspark.mllib.tree :members: :undoc-members: - :show-inheritance: + :inherited-members: pyspark.mllib.util module ------------------------- @@ -79,4 +75,3 @@ pyspark.mllib.util module .. automodule:: pyspark.mllib.util :members: :undoc-members: - :show-inheritance: diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b6de7493d7..4ff7463498 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -15,10 +15,11 @@ # limitations under the License. 
# -from pyspark.ml.util import inherit_doc, keyword_only +from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,\ HasRegParam +from pyspark.mllib.common import inherit_doc __all__ = ['LogisticRegression', 'LogisticRegressionModel'] diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index f1ddbb478d..433b4fb5d2 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -16,8 +16,9 @@ # from pyspark.ml.param.shared import HasInputCol, HasOutputCol, HasNumFeatures -from pyspark.ml.util import inherit_doc, keyword_only +from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaTransformer +from pyspark.mllib.common import inherit_doc __all__ = ['Tokenizer', 'HashingTF'] diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 18d8a58f35..5233c5801e 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -18,7 +18,8 @@ from abc import ABCMeta, abstractmethod from pyspark.ml.param import Param, Params -from pyspark.ml.util import inherit_doc, keyword_only +from pyspark.ml.util import keyword_only +from pyspark.mllib.common import inherit_doc __all__ = ['Estimator', 'Transformer', 'Pipeline', 'PipelineModel'] diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index 81d3f0882b..6f7f39c40e 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -19,20 +19,6 @@ from functools import wraps import uuid -def inherit_doc(cls): - for name, func in vars(cls).items(): - # only inherit docstring for public functions - if name.startswith("_"): - continue - if not func.__doc__: - for parent in cls.__bases__: - parent_func = getattr(parent, name, None) - if parent_func and getattr(parent_func, "__doc__", None): - func.__doc__ = parent_func.__doc__ - break - return cls - - def keyword_only(func): """ A 
decorator that forces keyword arguments in the wrapped method diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 9e12ddc3d9..4bae96f678 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -21,7 +21,7 @@ from pyspark import SparkContext from pyspark.sql import DataFrame from pyspark.ml.param import Params from pyspark.ml.pipeline import Estimator, Transformer -from pyspark.ml.util import inherit_doc +from pyspark.mllib.common import inherit_doc def _jvm(): diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py index c3217620e3..6449800d9c 100644 --- a/python/pyspark/mllib/__init__.py +++ b/python/pyspark/mllib/__init__.py @@ -19,7 +19,7 @@ Python bindings for MLlib. """ -# MLlib currently needs and NumPy 1.4+, so complain if lower +# MLlib currently needs NumPy 1.4+, so complain if lower import numpy if numpy.version.version < '1.4': diff --git a/python/pyspark/mllib/clustering.py b/python/pyspark/mllib/clustering.py index f6b97abb17..949db5705a 100644 --- a/python/pyspark/mllib/clustering.py +++ b/python/pyspark/mllib/clustering.py @@ -152,7 +152,7 @@ class GaussianMixtureModel(object): class GaussianMixture(object): """ - Estimate model parameters with the expectation-maximization algorithm. + Learning algorithm for Gaussian Mixtures using the expectation-maximization algorithm. :param data: RDD of data points :param k: Number of components diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py index 3c5ee66cd8..621591c26b 100644 --- a/python/pyspark/mllib/common.py +++ b/python/pyspark/mllib/common.py @@ -134,3 +134,20 @@ class JavaModelWrapper(object): def call(self, name, *a): """Call method of java_model""" return callJavaFunc(self._sc, getattr(self._java_model, name), *a) + + +def inherit_doc(cls): + """ + A decorator that makes a class inherit documentation from its parents. 
+ """ + for name, func in vars(cls).items(): + # only inherit docstring for public functions + if name.startswith("_"): + continue + if not func.__doc__: + for parent in cls.__bases__: + parent_func = getattr(parent, name, None) + if parent_func and getattr(parent_func, "__doc__", None): + func.__doc__ = parent_func.__doc__ + break + return cls diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 210060140f..21751cc68f 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -18,7 +18,7 @@ import numpy as np from numpy import array -from pyspark.mllib.common import callMLlibFunc +from pyspark.mllib.common import callMLlibFunc, inherit_doc from pyspark.mllib.linalg import SparseVector, _convert_to_vector __all__ = ['LabeledPoint', 'LinearModel', 'LinearRegressionModel', 'RidgeRegressionModel', @@ -69,6 +69,7 @@ class LinearModel(object): return "(weights=%s, intercept=%r)" % (self._coeff, self._intercept) +@inherit_doc class LinearRegressionModelBase(LinearModel): """A linear regression model. @@ -89,6 +90,7 @@ class LinearRegressionModelBase(LinearModel): return self.weights.dot(x) + self.intercept +@inherit_doc class LinearRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit. @@ -162,7 +164,7 @@ class LinearRegressionWithSGD(object): @param intercept: Boolean parameter which indicates the use or not of the augmented representation for training data (i.e. whether bias features - are activated or not). + are activated or not). 
(default: False) """ def train(rdd, i): return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations), @@ -172,6 +174,7 @@ class LinearRegressionWithSGD(object): return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights) +@inherit_doc class LassoModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an @@ -218,6 +221,7 @@ class LassoWithSGD(object): return _regression_train_wrapper(train, LassoModel, data, initialWeights) +@inherit_doc class RidgeRegressionModel(LinearRegressionModelBase): """A linear regression model derived from a least-squares fit with an diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py index b686d955a0..e3e128513e 100644 --- a/python/pyspark/mllib/stat/__init__.py +++ b/python/pyspark/mllib/stat/__init__.py @@ -21,5 +21,7 @@ Python package for statistical functions in MLlib. from pyspark.mllib.stat._statistics import * from pyspark.mllib.stat.distribution import MultivariateGaussian +from pyspark.mllib.stat.test import ChiSqTestResult -__all__ = ["Statistics", "MultivariateStatisticalSummary", "MultivariateGaussian"] +__all__ = ["Statistics", "MultivariateStatisticalSummary", "ChiSqTestResult", + "MultivariateGaussian"] diff --git a/python/pyspark/mllib/stat/distribution.py b/python/pyspark/mllib/stat/distribution.py index 07792e1532..46f7a1d2f2 100644 --- a/python/pyspark/mllib/stat/distribution.py +++ b/python/pyspark/mllib/stat/distribution.py @@ -22,7 +22,8 @@ __all__ = ['MultivariateGaussian'] class MultivariateGaussian(namedtuple('MultivariateGaussian', ['mu', 'sigma'])): - """ Represents a (mu, sigma) tuple + """Represents a (mu, sigma) tuple + >>> m = MultivariateGaussian(Vectors.dense([11,12]),DenseMatrix(2, 2, (1.0, 3.0, 5.0, 2.0))) >>> (m.mu, m.sigma.toArray()) (DenseVector([11.0, 12.0]), array([[ 1., 5.],[ 3., 2.]])) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 
aae48f2132..02d551b87d 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -20,12 +20,12 @@ from __future__ import absolute_import import random from pyspark import SparkContext, RDD -from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper +from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint __all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', - 'RandomForest', 'GradientBoostedTrees'] + 'RandomForest', 'GradientBoostedTreesModel', 'GradientBoostedTrees'] class TreeEnsembleModel(JavaModelWrapper): @@ -216,6 +216,7 @@ class DecisionTree(object): impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) +@inherit_doc class RandomForestModel(TreeEnsembleModel): """ .. note:: Experimental @@ -381,6 +382,7 @@ class RandomForest(object): featureSubsetStrategy, impurity, maxDepth, maxBins, seed) +@inherit_doc class GradientBoostedTreesModel(TreeEnsembleModel): """ .. note:: Experimental |