aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/ml/classification.py
diff options
context:
space:
mode:
authorHolden Karau <holden@us.ibm.com>2016-05-09 09:11:17 +0100
committerSean Owen <sowen@cloudera.com>2016-05-09 09:11:17 +0100
commit12fe2ecd1998a8b01667aa1ab910a604b2aec4c8 (patch)
tree39813ff79a12b15e95541e6b68077704eadbbd8f /python/pyspark/ml/classification.py
parent68abc1b4e9afbb6c2a87689221a46b835dded102 (diff)
downloadspark-12fe2ecd1998a8b01667aa1ab910a604b2aec4c8.tar.gz
spark-12fe2ecd1998a8b01667aa1ab910a604b2aec4c8.tar.bz2
spark-12fe2ecd1998a8b01667aa1ab910a604b2aec4c8.zip
[SPARK-15136][PYSPARK][DOC] Fix links to sphinx style and add a default param doc note
## What changes were proposed in this pull request? PyDoc links in ml are in non-standard format. Switch to standard sphinx link format for better formatted documentation. Also add a note about default value in one place. Copy some extended docs from scala for GBT ## How was this patch tested? Built docs locally. Author: Holden Karau <holden@us.ibm.com> Closes #12918 from holdenk/SPARK-15137-linkify-pyspark-ml-classification.
Diffstat (limited to 'python/pyspark/ml/classification.py')
-rw-r--r--python/pyspark/ml/classification.py28
1 file changed, 20 insertions, 8 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index f032963334..c26c2d7fa5 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -353,7 +353,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
Returns the receiver operating characteristic (ROC) curve,
which is a Dataframe having two fields (FPR, TPR) with
(0.0, 0.0) prepended and (1.0, 1.0) appended to it.
- Reference: http://en.wikipedia.org/wiki/Receiver_operating_characteristic
+
+ .. seealso:: `Wikipedia reference \
+ <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
Note: This ignores instance weights (setting all to 1.0) from
`LogisticRegression.weightCol`. This will change in later Spark
@@ -489,7 +491,7 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable,
JavaMLReadable):
"""
- `http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
+ `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical
features.
@@ -616,7 +618,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
RandomForestParams, TreeClassifierParams, HasCheckpointInterval,
JavaMLWritable, JavaMLReadable):
"""
- `http://en.wikipedia.org/wiki/Random_forest Random Forest`
+ `Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical
features.
@@ -734,11 +736,21 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable,
JavaMLReadable):
"""
- `http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)`
+ `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
learning algorithm for classification.
It supports binary labels, as well as both continuous and categorical features.
Note: Multiclass labels are not currently supported.
+ The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
+
+ Notes on Gradient Boosting vs. TreeBoost:
+ - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
+ - Both algorithms learn tree ensembles by minimizing loss functions.
+ - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
+ based on the loss function, whereas the original gradient boosting method does not.
+ - We expect to implement TreeBoost in the future:
+ `SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_
+
>>> from numpy import allclose
>>> from pyspark.mllib.linalg import Vectors
>>> from pyspark.ml.feature import StringIndexer
@@ -863,12 +875,12 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
HasRawPredictionCol, JavaMLWritable, JavaMLReadable):
"""
Naive Bayes Classifiers.
- It supports both Multinomial and Bernoulli NB. Multinomial NB
- (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`)
+ It supports both Multinomial and Bernoulli NB. `Multinomial NB
+ <http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html>`_
can handle finitely supported discrete data. For example, by converting documents into
TF-IDF vectors, it can be used for document classification. By making every vector a
- binary (0/1) data, it can also be used as Bernoulli NB
- (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`).
+ binary (0/1) data, it can also be used as `Bernoulli NB
+ <http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html>`_.
The input feature values must be nonnegative.
>>> from pyspark.sql import Row