path: root/python
authorHolden Karau <holden@us.ibm.com>2016-05-09 09:11:17 +0100
committerSean Owen <sowen@cloudera.com>2016-05-09 09:11:17 +0100
commit12fe2ecd1998a8b01667aa1ab910a604b2aec4c8 (patch)
tree39813ff79a12b15e95541e6b68077704eadbbd8f /python
parent68abc1b4e9afbb6c2a87689221a46b835dded102 (diff)
[SPARK-15136][PYSPARK][DOC] Fix links to sphinx style and add a default param doc note
## What changes were proposed in this pull request?

PyDoc links in ml are in a non-standard format. Switch to the standard Sphinx link format for better-formatted documentation. Also add a note about a default value in one place, and copy some extended docs from Scala for GBT.

## How was this patch tested?

Built docs locally.

Author: Holden Karau <holden@us.ibm.com>

Closes #12918 from holdenk/SPARK-15137-linkify-pyspark-ml-classification.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/ml/classification.py28
-rwxr-xr-xpython/pyspark/ml/feature.py13
-rw-r--r--python/pyspark/ml/recommendation.py4
-rw-r--r--python/pyspark/ml/regression.py14
-rw-r--r--python/pyspark/rdd.py6
5 files changed, 40 insertions, 25 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index f032963334..c26c2d7fa5 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -353,7 +353,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
Returns the receiver operating characteristic (ROC) curve,
which is a DataFrame having two fields (FPR, TPR) with
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.
- Reference: http://en.wikipedia.org/wiki/Receiver_operating_characteristic
+
+ .. seealso:: `Wikipedia reference \
+ <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
Note: This ignores instance weights (setting all to 1.0) from
`LogisticRegression.weightCol`. This will change in later Spark
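
For context, a minimal sketch (assuming a `sqlContext`, as in the surrounding doctests) of how the ROC curve surfaces through the binary summary:

    from pyspark.ml.classification import LogisticRegression
    from pyspark.mllib.linalg import Vectors

    df = sqlContext.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"])
    model = LogisticRegression(maxIter=5, regParam=0.01).fit(df)
    model.summary.roc.show()  # a DataFrame with columns FPR, TPR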
@@ -489,7 +491,7 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable,
JavaMLReadable):
"""
- `http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
+ `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical
features.
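
A minimal usage sketch in the style of the class doctests (`sqlContext` assumed, data illustrative):

    from pyspark.ml.classification import DecisionTreeClassifier
    from pyspark.mllib.linalg import Vectors

    df = sqlContext.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"])
    model = DecisionTreeClassifier(maxDepth=2).fit(df)
    model.transform(df).select("prediction").show()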
@@ -616,7 +618,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
RandomForestParams, TreeClassifierParams, HasCheckpointInterval,
JavaMLWritable, JavaMLReadable):
"""
- `http://en.wikipedia.org/wiki/Random_forest Random Forest`
+ `Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical
features.
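
The random forest variant differs mainly in its ensembling parameters; an illustrative sketch (reusing `df` from the decision tree sketch above):

    from pyspark.ml.classification import RandomForestClassifier

    # numTrees controls ensemble size; seed makes the subsampling reproducible.
    rf = RandomForestClassifier(numTrees=20, maxDepth=3, seed=42)
    model = rf.fit(df)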
@@ -734,11 +736,21 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable,
JavaMLReadable):
"""
- `http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)`
+ `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
learning algorithm for classification.
It supports binary labels, as well as both continuous and categorical features.
Note: Multiclass labels are not currently supported.
+ The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
+
+ Notes on Gradient Boosting vs. TreeBoost:
+ - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
+ - Both algorithms learn tree ensembles by minimizing loss functions.
+ - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
+ based on the loss function, whereas the original gradient boosting method does not.
+ - We expect to implement TreeBoost in the future:
+ `SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_
+
>>> from numpy import allclose
>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.feature import StringIndexer
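
Given the notes above, a brief sketch of the parameters that govern the boosting behavior (values are illustrative, not defaults):

    from pyspark.ml.classification import GBTClassifier

    # lossType="logistic" is the loss minimized for classification;
    # stepSize is the learning rate shrinking each tree's contribution.
    gbt = GBTClassifier(maxIter=10, maxDepth=3, stepSize=0.1,
                        lossType="logistic")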
@@ -863,12 +875,12 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
HasRawPredictionCol, JavaMLWritable, JavaMLReadable):
"""
Naive Bayes Classifiers.
- It supports both Multinomial and Bernoulli NB. Multinomial NB
- (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`)
+ It supports both Multinomial and Bernoulli NB. `Multinomial NB
+ <http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html>`_
can handle finitely supported discrete data. For example, by converting documents into
TF-IDF vectors, it can be used for document classification. By making every vector a
- binary (0/1) data, it can also be used as Bernoulli NB
- (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`).
+ binary (0/1) data, it can also be used as `Bernoulli NB
+ <http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html>`_.
The input feature values must be nonnegative.
>>> from pyspark.sql import Row
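
To make the Multinomial/Bernoulli distinction concrete, a small sketch (modelType is the only switch):

    from pyspark.ml.classification import NaiveBayes

    # Multinomial NB over count-like features such as TF-IDF vectors:
    nb_multi = NaiveBayes(smoothing=1.0, modelType="multinomial")
    # Bernoulli NB over binary (0/1) features:
    nb_bern = NaiveBayes(smoothing=1.0, modelType="bernoulli")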
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index d2989fa4cd..606a6e7c22 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -377,8 +377,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
The return vector is scaled such that the transform matrix is
unitary (aka scaled DCT-II).
- More information on
- `https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia`.
+ .. seealso:: `More information on Wikipedia \
+ <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.
>>> from pyspark.mllib.linalg import Vectors
>>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
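
A sketch of the transformer in use (`sqlContext` assumed; inverse=True applies the inverse transform instead):

    from pyspark.ml.feature import DCT
    from pyspark.mllib.linalg import Vectors

    df = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
    dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
    dct.transform(df).select("resultVec").show(truncate=False)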
@@ -1108,8 +1108,8 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
"""
.. note:: Experimental
- Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion,
- which is available at `http://en.wikipedia.org/wiki/Polynomial_expansion`, "In mathematics, an
+ Perform feature expansion in a polynomial space. As stated in the `Wikipedia article on Polynomial Expansion
+ <http://en.wikipedia.org/wiki/Polynomial_expansion>`_, "In mathematics, an
expansion of a product of sums expresses it as a sum of products by using the fact that
multiplication distributes over addition". Take a 2-variable feature vector as an example:
`(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
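
The `(x, y)` example above, run through the transformer (`sqlContext` assumed):

    from pyspark.ml.feature import PolynomialExpansion
    from pyspark.mllib.linalg import Vectors

    df = sqlContext.createDataFrame([(Vectors.dense([0.5, 2.0]),)], ["features"])
    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="expanded")
    px.transform(df).head().expanded
    # DenseVector([0.5, 0.25, 2.0, 1.0, 4.0]), i.e. (x, x*x, y, x*y, y*y)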
@@ -2432,9 +2432,8 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM
Implements the transforms required for fitting a dataset against an
R model formula. Currently we support a limited subset of the R
- operators, including '~', '.', ':', '+', and '-'. Also see the R formula
- docs:
- http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
+ operators, including '~', '.', ':', '+', and '-'. Also see the `R formula docs
+ <http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html>`_.
>>> df = sqlContext.createDataFrame([
... (1.0, 1.0, "a"),
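
A sketch of the formula syntax in use (`sqlContext` assumed; column names are illustrative):

    from pyspark.ml.feature import RFormula

    df = sqlContext.createDataFrame(
        [(1.0, 1.0, "a"), (0.0, 2.0, "b"), (0.0, 0.0, "a")], ["y", "x", "s"])
    # "y ~ x + s": label y, features x plus the encoded string factor s.
    rf = RFormula(formula="y ~ x + s")
    rf.fit(df).transform(df).select("features", "label").show()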
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index db02684262..d7cb658465 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -54,8 +54,8 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
and update the products based on these messages.
For implicit preference data, the algorithm used is based on
- "Collaborative Filtering for Implicit Feedback Datasets", available
- at `http://dx.doi.org/10.1109/ICDM.2008.22`, adapted for the blocked
+ `"Collaborative Filtering for Implicit Feedback Datasets"
+ <http://dx.doi.org/10.1109/ICDM.2008.22>`_, adapted for the blocked
approach used here.
Essentially instead of finding the low-rank approximations to the
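
A sketch of selecting the implicit-feedback variant described above (column names are assumptions):

    from pyspark.ml.recommendation import ALS

    # implicitPrefs=True enables the implicit-feedback formulation;
    # alpha scales the confidence attached to observed interactions.
    als = ALS(rank=10, maxIter=5, implicitPrefs=True, alpha=1.0,
              userCol="user", itemCol="item", ratingCol="rating")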
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 04f566dfec..a2300fa49c 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -229,7 +229,9 @@ class LinearRegressionSummary(JavaWrapper):
"""
Returns the explained variance regression score.
explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
- Reference: http://en.wikipedia.org/wiki/Explained_variation
+
+ .. seealso:: `Wikipedia explained variation \
+ <http://en.wikipedia.org/wiki/Explained_variation>`_
Note: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
@@ -283,7 +285,9 @@ class LinearRegressionSummary(JavaWrapper):
def r2(self):
"""
Returns R^2^, the coefficient of determination.
- Reference: http://en.wikipedia.org/wiki/Coefficient_of_determination
+
+ .. seealso:: `Wikipedia coefficient of determination \
+ <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
Note: This ignores instance weights (setting all to 1.0) from
`LinearRegression.weightCol`. This will change in later Spark
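
Both metrics are read off the training summary; a minimal sketch (assuming `df` has "label" and "features" columns):

    from pyspark.ml.regression import LinearRegression

    model = LinearRegression(maxIter=5).fit(df)
    summary = model.summary
    print(summary.explainedVariance, summary.r2)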
@@ -627,7 +631,7 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval,
HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol):
"""
- `http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
+ `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
learning algorithm for regression.
It supports both continuous and categorical features.
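
A sketch touching the HasVarianceCol mixin visible in the signature above (treating varianceCol as a constructor keyword is an assumption):

    from pyspark.ml.regression import DecisionTreeRegressor

    # varianceCol, if set, adds a column with the per-prediction variance.
    dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance")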
@@ -782,7 +786,7 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
RandomForestParams, TreeRegressorParams, HasCheckpointInterval,
JavaMLWritable, JavaMLReadable):
"""
- `http://en.wikipedia.org/wiki/Random_forest Random Forest`
+ `Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
learning algorithm for regression.
It supports both continuous and categorical features.
@@ -890,7 +894,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable,
JavaMLReadable):
"""
- `http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)`
+ `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
learning algorithm for regression.
It supports both continuous and categorical features.
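
For the two tree-ensemble regressors above, an illustrative sketch of their distinguishing parameters:

    from pyspark.ml.regression import GBTRegressor, RandomForestRegressor

    rf = RandomForestRegressor(numTrees=20, maxDepth=3)
    # For regression, GBT minimizes squared error by default;
    # lossType="absolute" selects absolute error instead.
    gbt = GBTRegressor(maxIter=10, lossType="squared")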
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 8978f028c5..411e377a56 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2274,9 +2274,9 @@ class RDD(object):
Return approximate number of distinct elements in the RDD.
The algorithm used is based on streamlib's implementation of
- "HyperLogLog in Practice: Algorithmic Engineering of a State
- of The Art Cardinality Estimation Algorithm", available
- <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
+ `"HyperLogLog in Practice: Algorithmic Engineering of a State
+ of The Art Cardinality Estimation Algorithm", available here
+ <http://dx.doi.org/10.1145/2452376.2452456>`_.
:param relativeSD: Relative accuracy. Smaller values create
counters that require more space.
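
A quick sketch of the accuracy/space trade-off (assuming a SparkContext `sc`):

    # relativeSD=0.05 asks for roughly 5% relative error; smaller values
    # use more space per counter.
    rdd = sc.parallelize(range(10000)).map(lambda x: x % 1000)
    rdd.countApproxDistinct(relativeSD=0.05)  # close to 1000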