From 12fe2ecd1998a8b01667aa1ab910a604b2aec4c8 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Mon, 9 May 2016 09:11:17 +0100
Subject: [SPARK-15136][PYSPARK][DOC] Fix links to sphinx style and add a default param doc note

## What changes were proposed in this pull request?

PyDoc links in ml are in a non-standard format. Switch to the standard sphinx link format for better formatted documentation. Also add a note about the default value in one place. Copy some extended docs from Scala for GBT.

## How was this patch tested?

Built docs locally.

Author: Holden Karau

Closes #12918 from holdenk/SPARK-15137-linkify-pyspark-ml-classification.
---
 python/pyspark/ml/classification.py | 28 ++++++++++++++++++++--------
 python/pyspark/ml/feature.py        | 13 ++++++-------
 python/pyspark/ml/recommendation.py |  4 ++--
 python/pyspark/ml/regression.py     | 14 +++++++++-----
 python/pyspark/rdd.py               |  6 +++---
 5 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index f032963334..c26c2d7fa5 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -353,7 +353,9 @@ class BinaryLogisticRegressionSummary(LogisticRegressionSummary):
 Returns the receiver operating characteristic (ROC) curve,
 which is an Dataframe having two fields (FPR, TPR) with
 (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
- Reference: http://en.wikipedia.org/wiki/Receiver_operating_characteristic
+
+ .. seealso:: `Wikipedia reference \
+   <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

 Note: This ignores instance weights (setting all to 1.0) from
 `LogisticRegression.weightCol`. This will change in later Spark
@@ -489,7 +491,7 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
 TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable,
 JavaMLReadable):
 """
- `http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
+ `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
 learning algorithm for classification.
 It supports both binary and multiclass labels, as well as both continuous
 and categorical features.
@@ -616,7 +618,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
 RandomForestParams, TreeClassifierParams, HasCheckpointInterval,
 JavaMLWritable, JavaMLReadable):
 """
- `http://en.wikipedia.org/wiki/Random_forest Random Forest`
+ `Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
 learning algorithm for classification.
 It supports both binary and multiclass labels, as well as both continuous
 and categorical features.
@@ -734,11 +736,21 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
 GBTParams, HasCheckpointInterval, HasStepSize, HasSeed,
 JavaMLWritable, JavaMLReadable):
 """
- `http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)`
+ `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
 learning algorithm for classification.
 It supports binary labels, as well as both continuous and categorical features.
 Note: Multiclass labels are not currently supported.
+ The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
+
+ Notes on Gradient Boosting vs. TreeBoost:
+ - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
+ - Both algorithms learn tree ensembles by minimizing loss functions.
+ - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
+ based on the loss function, whereas the original gradient boosting method does not.
+ - We expect to implement TreeBoost in the future:
+   `SPARK-4240 <https://issues.apache.org/jira/browse/SPARK-4240>`_

 >>> from numpy import allclose
 >>> from pyspark.mllib.linalg import Vectors
 >>> from pyspark.ml.feature import StringIndexer
@@ -863,12 +875,12 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
 HasRawPredictionCol, JavaMLWritable, JavaMLReadable):
 """
 Naive Bayes Classifiers.
- It supports both Multinomial and Bernoulli NB. Multinomial NB
- (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`)
+ It supports both Multinomial and Bernoulli NB. `Multinomial NB
+ <http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html>`_
 can handle finitely supported discrete data. For example, by converting documents into
 TF-IDF vectors, it can be used for document classification. By making every vector a
- binary (0/1) data, it can also be used as Bernoulli NB
- (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`).
+ binary (0/1) data, it can also be used as `Bernoulli NB
+ <http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html>`_.
 The input feature values must be nonnegative.

 >>> from pyspark.sql import Row

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index d2989fa4cd..606a6e7c22 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -377,8 +377,8 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
 The return vector is scaled such that the transform matrix is
 unitary (aka scaled DCT-II).

- More information on
- `https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia`.
+ .. seealso:: `More information on Wikipedia \
+   <https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II>`_.

 >>> from pyspark.mllib.linalg import Vectors
 >>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
@@ -1108,8 +1108,8 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
 """
 .. note:: Experimental

- Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion,
- which is available at `http://en.wikipedia.org/wiki/Polynomial_expansion`, "In mathematics, an
+ Perform feature expansion in a polynomial space. As said in `wikipedia of Polynomial Expansion
+ <http://en.wikipedia.org/wiki/Polynomial_expansion>`_, "In mathematics, an
 expansion of a product of sums expresses it as a sum of products by using the fact that
 multiplication distributes over addition". Take a 2-variable feature vector as an example:
 `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
@@ -2432,9 +2432,8 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM
 Implements the transforms required for fitting a dataset against an
 R model formula. Currently we support a limited subset of the R
- operators, including '~', '.', ':', '+', and '-'. Also see the R formula
- docs:
- http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
+ operators, including '~', '.', ':', '+', and '-'. Also see the `R formula docs
+ <http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html>`_.

 >>> df = sqlContext.createDataFrame([
 ... (1.0, 1.0, "a"),

diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index db02684262..d7cb658465 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -54,8 +54,8 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
 and update the products based on these messages.
 For implicit preference data, the algorithm used is based on
- "Collaborative Filtering for Implicit Feedback Datasets", available
- at `http://dx.doi.org/10.1109/ICDM.2008.22`, adapted for the blocked
+ `"Collaborative Filtering for Implicit Feedback Datasets",
+   <http://dx.doi.org/10.1109/ICDM.2008.22>`_, adapted for the blocked
 approach used here.

 Essentially instead of finding the low-rank approximations to the

diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 04f566dfec..a2300fa49c 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -229,7 +229,9 @@ class LinearRegressionSummary(JavaWrapper):
 """
 Returns the explained variance regression score.
 explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
- Reference: http://en.wikipedia.org/wiki/Explained_variation
+
+ .. seealso:: `Wikipedia explain variation \
+   <http://en.wikipedia.org/wiki/Explained_variation>`_

 Note: This ignores instance weights (setting all to 1.0) from
 `LinearRegression.weightCol`. This will change in later Spark
@@ -283,7 +285,9 @@ class LinearRegressionSummary(JavaWrapper):
 def r2(self):
 """
 Returns R^2^, the coefficient of determination.
- Reference: http://en.wikipedia.org/wiki/Coefficient_of_determination
+
+ .. seealso:: `Wikipedia coefficient of determination \
+   <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_

 Note: This ignores instance weights (setting all to 1.0) from
 `LinearRegression.weightCol`. This will change in later Spark
@@ -627,7 +631,7 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
 DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval,
 HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol):
 """
- `http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
+ `Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
 learning algorithm for regression.
 It supports both continuous and categorical features.
@@ -782,7 +786,7 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
 RandomForestParams, TreeRegressorParams, HasCheckpointInterval,
 JavaMLWritable, JavaMLReadable):
 """
- `http://en.wikipedia.org/wiki/Random_forest Random Forest`
+ `Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
 learning algorithm for regression.
 It supports both continuous and categorical features.
@@ -890,7 +894,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
 GBTParams, HasCheckpointInterval, HasStepSize, HasSeed,
 JavaMLWritable, JavaMLReadable):
 """
- `http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)`
+ `Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
 learning algorithm for regression.
 It supports both continuous and categorical features.

diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 8978f028c5..411e377a56 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -2274,9 +2274,9 @@ class RDD(object):
 Return approximate number of distinct elements in the RDD.

 The algorithm used is based on streamlib's implementation of
- "HyperLogLog in Practice: Algorithmic Engineering of a State
- of The Art Cardinality Estimation Algorithm", available
- here.
+ `"HyperLogLog in Practice: Algorithmic Engineering of a State
+ of The Art Cardinality Estimation Algorithm", available here
+   <http://dx.doi.org/10.1145/2452376.2452456>`_.

 :param relativeSD: Relative accuracy. Smaller values create
 counters that require more space.
-- 
cgit v1.2.3
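
The short sketches below are editorial additions, not part of the patch; they show the APIs whose docstrings the diff touches, written against the Spark 1.6-era Python API. All assume a live SparkContext `sc` and a `sqlContext`, as the doctests in the diff do, and all data and column names are made up. First, the two Naive Bayes variants linked in the classification.py hunk are selected through the `modelType` param:

    from pyspark.ml.classification import NaiveBayes

    # "multinomial" handles finitely supported discrete data such as
    # TF-IDF counts; "bernoulli" expects binary (0/1) feature values.
    multinomial_nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    bernoulli_nb = NaiveBayes(smoothing=1.0, modelType="bernoulli")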
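
A minimal sketch of the degree-2 expansion described in the PolynomialExpansion docstring, `(x, y)` expanding to `(x, x * x, y, x * y, y * y)`:

    from pyspark.mllib.linalg import Vectors
    from pyspark.ml.feature import PolynomialExpansion

    df = sqlContext.createDataFrame([(Vectors.dense([2.0, 3.0]),)], ["features"])
    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="expanded")
    # Expands (2.0, 3.0) to (2.0, 4.0, 3.0, 6.0, 9.0),
    # i.e. (x, x * x, y, x * y, y * y).
    px.transform(df).head().expanded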
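
A minimal sketch of the implicit-preference mode the ALS docstring describes; `implicitPrefs=True` selects the formulation from the "Collaborative Filtering for Implicit Feedback Datasets" paper cited above:

    from pyspark.ml.recommendation import ALS

    ratings = sqlContext.createDataFrame(
        [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0)],
        ["user", "item", "rating"])
    # alpha scales the confidence attached to the observed preferences.
    als = ALS(rank=10, maxIter=5, implicitPrefs=True, alpha=1.0,
              userCol="user", itemCol="item", ratingCol="rating")
    model = als.fit(ratings)
    predictions = model.transform(ratings)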
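
The explainedVariance formula in the LinearRegressionSummary docstring can be checked directly with numpy on made-up labels and predictions:

    import numpy as np

    y = np.array([1.0, 2.0, 3.0, 4.0])
    y_hat = np.array([1.1, 1.9, 3.2, 3.8])
    # explainedVariance = 1 - variance(y - y_hat) / variance(y)
    explained_variance = 1.0 - np.var(y - y_hat) / np.var(y)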
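
Finally, a minimal sketch of the HyperLogLog-based estimator documented in the rdd.py hunk; `relativeSD` trades accuracy against counter size:

    # Roughly 1000, within about 5% relative error.
    n = sc.parallelize(range(1000)).countApproxDistinct(relativeSD=0.05)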