From d7af736b2cf6c392b87e7b45c2d2219ef06979eb Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley"
Date: Fri, 8 Apr 2016 20:15:44 -0700
Subject: [SPARK-14498][ML][PYTHON][SQL] Many cleanups to ML and ML-related
 docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What changes were proposed in this pull request?

Cleanups to documentation. No changes to code.
* GBT docs: Move Scala doc for private object GradientBoostedTrees to public docs for GBTClassifier,Regressor
* GLM regParam: needs doc saying it is for L2 only
* TrainValidationSplitModel: add .. versionadded:: 2.0.0
* Rename “_transformer_params_from_java” to “_transfer_params_from_java”
* LogReg Summary classes: “probability” col should not say “calibrated”
* LR summaries: coefficientStandardErrors —> document that intercept stderr comes last. Same for t,p-values
* approxCountDistinct: Document meaning of “rsd" argument.
* LDA: note which params are for online LDA only

## How was this patch tested?

Doc build

Author: Joseph K. Bradley

Closes #12266 from jkbradley/ml-doc-cleanups.
---
 .../spark/ml/classification/GBTClassifier.scala    | 10 +++++++
 .../ml/classification/LogisticRegression.scala     | 12 ++++----
 .../scala/org/apache/spark/ml/clustering/LDA.scala | 34 +++++++++++++++-------
 .../apache/spark/ml/regression/GBTRegressor.scala  | 12 ++++++++
 .../regression/GeneralizedLinearRegression.scala   | 17 +++++++++--
 .../spark/ml/regression/LinearRegression.scala     | 19 ++++++++----
 .../spark/ml/tree/impl/GradientBoostedTrees.scala  | 16 ----------
 python/pyspark/ml/classification.py                |  2 +-
 python/pyspark/ml/regression.py                    |  9 ++++++
 python/pyspark/ml/tuning.py                        |  2 ++
 python/pyspark/ml/wrapper.py                       |  2 +-
 .../scala/org/apache/spark/sql/functions.scala     |  4 +++
 12 files changed, 97 insertions(+), 42 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index bee90fb3a5..a2150fbcc3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -42,6 +42,16 @@ import org.apache.spark.sql.functions._
  * learning algorithm for classification.
  * It supports binary labels, as well as both continuous and categorical features.
  * Note: Multiclass labels are not currently supported.
+ *
+ * The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
+ *
+ * Notes on Gradient Boosting vs. TreeBoost:
+ *  - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
+ *  - Both algorithms learn tree ensembles by minimizing loss functions.
+ *  - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
+ *    based on the loss function, whereas the original gradient boosting method does not.
+ *  - We expect to implement TreeBoost in the future:
+ *    [https://issues.apache.org/jira/browse/SPARK-4240]
  */
 @Since("1.4.0")
 @Experimental
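The note above is documentation only; the estimator itself is unchanged. For context, a minimal sketch of how the documented `GBTClassifier` is driven; the DataFrame `training` with binary "label" and "features" columns is an assumption for illustration, not part of this patch:

```scala
import org.apache.spark.ml.classification.GBTClassifier

// Stochastic Gradient Boosting for binary classification.
// `training` is an assumed DataFrame with "label" and "features" columns.
val gbt = new GBTClassifier()
  .setMaxIter(20)           // number of trees in the ensemble
  .setMaxDepth(3)           // depth of each tree
  .setLossType("logistic")  // the loss minimized by gradient boosting
val model = gbt.fit(training)
```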
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 37182928cc..268c3e32c3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -774,10 +774,10 @@ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary
  */
 sealed trait LogisticRegressionSummary extends Serializable {
 
-  /** Dataframe outputted by the model's `transform` method. */
+  /** Dataframe output by the model's `transform` method. */
   def predictions: DataFrame
 
-  /** Field in "predictions" which gives the calibrated probability of each class as a vector. */
+  /** Field in "predictions" which gives the probability of each class as a vector. */
   def probabilityCol: String
 
   /** Field in "predictions" which gives the true label of each instance (if available). */
@@ -792,8 +792,8 @@ sealed trait LogisticRegressionSummary extends Serializable {
  * :: Experimental ::
  * Logistic regression training results.
  *
- * @param predictions dataframe outputted by the model's `transform` method.
- * @param probabilityCol field in "predictions" which gives the calibrated probability of
+ * @param predictions dataframe output by the model's `transform` method.
+ * @param probabilityCol field in "predictions" which gives the probability of
  *                       each class as a vector.
  * @param labelCol field in "predictions" which gives the true label of each instance.
  * @param featuresCol field in "predictions" which gives the features of each instance as a vector.
@@ -816,8 +816,8 @@ class BinaryLogisticRegressionTrainingSummary private[classification] (
  * :: Experimental ::
  * Binary Logistic regression results for a given model.
  *
- * @param predictions dataframe outputted by the model's `transform` method.
- * @param probabilityCol field in "predictions" which gives the calibrated probability of
+ * @param predictions dataframe output by the model's `transform` method.
+ * @param probabilityCol field in "predictions" which gives the probability of
  *                       each class as a vector.
 * @param labelCol field in "predictions" which gives the true label of each instance.
 * @param featuresCol field in "predictions" which gives the features of each instance as a vector.
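The "calibrated" wording was dropped because the values in the probability column are the model's raw conditional probability estimates; no calibration step is applied. A short sketch of where that column surfaces, assuming an illustrative DataFrame `training`:

```scala
import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
val model = lr.fit(training)   // `training` is an assumed DataFrame
val summary = model.summary
// probabilityCol names the column holding the per-class probability vector;
// these are raw model probabilities, not calibrated ones.
summary.predictions.select(summary.probabilityCol).show(5)
```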
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 727b724708..89a7a4ccf6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -190,6 +190,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
   def getTopicDistributionCol: String = $(topicDistributionCol)
 
   /**
+   * For Online optimizer only: [[optimizer]] = "online".
+   *
    * A (positive) learning parameter that downweights early iterations. Larger values make early
    * iterations count less.
    * This is called "tau0" in the Online LDA paper (Hoffman et al., 2010)
@@ -198,8 +200,9 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
    * @group expertParam
    */
   @Since("1.6.0")
-  final val learningOffset = new DoubleParam(this, "learningOffset", "A (positive) learning" +
-    " parameter that downweights early iterations. Larger values make early iterations count less.",
+  final val learningOffset = new DoubleParam(this, "learningOffset", "(For online optimizer)" +
+    " A (positive) learning parameter that downweights early iterations. Larger values make early" +
+    " iterations count less.",
     ParamValidators.gt(0))
 
   /** @group expertGetParam */
@@ -207,6 +210,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
   def getLearningOffset: Double = $(learningOffset)
 
   /**
+   * For Online optimizer only: [[optimizer]] = "online".
+   *
    * Learning rate, set as an exponential decay rate.
    * This should be between (0.5, 1.0] to guarantee asymptotic convergence.
    * This is called "kappa" in the Online LDA paper (Hoffman et al., 2010).
@@ -215,15 +220,17 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
    * @group expertParam
    */
   @Since("1.6.0")
-  final val learningDecay = new DoubleParam(this, "learningDecay", "Learning rate, set as an" +
-    " exponential decay rate. This should be between (0.5, 1.0] to guarantee asymptotic" +
-    " convergence.", ParamValidators.gt(0))
+  final val learningDecay = new DoubleParam(this, "learningDecay", "(For online optimizer)" +
+    " Learning rate, set as an exponential decay rate. This should be between (0.5, 1.0] to" +
+    " guarantee asymptotic convergence.", ParamValidators.gt(0))
 
   /** @group expertGetParam */
   @Since("1.6.0")
   def getLearningDecay: Double = $(learningDecay)
 
   /**
+   * For Online optimizer only: [[optimizer]] = "online".
+   *
    * Fraction of the corpus to be sampled and used in each iteration of mini-batch gradient descent,
    * in range (0, 1].
    *
@@ -239,8 +246,9 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
    * @group param
    */
   @Since("1.6.0")
-  final val subsamplingRate = new DoubleParam(this, "subsamplingRate", "Fraction of the corpus" +
-    " to be sampled and used in each iteration of mini-batch gradient descent, in range (0, 1].",
+  final val subsamplingRate = new DoubleParam(this, "subsamplingRate", "(For online optimizer)" +
+    " Fraction of the corpus to be sampled and used in each iteration of mini-batch" +
+    " gradient descent, in range (0, 1].",
     ParamValidators.inRange(0.0, 1.0, lowerInclusive = false, upperInclusive = true))
 
   /** @group getParam */
@@ -248,6 +256,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
   def getSubsamplingRate: Double = $(subsamplingRate)
 
   /**
+   * For Online optimizer only (currently): [[optimizer]] = "online".
+   *
    * Indicates whether the docConcentration (Dirichlet parameter for
    * document-topic distribution) will be optimized during training.
    * Setting this to true will make the model more expressive and fit the training data better.
@@ -257,15 +267,17 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
    */
   @Since("1.6.0")
   final val optimizeDocConcentration = new BooleanParam(this, "optimizeDocConcentration",
-    "Indicates whether the docConcentration (Dirichlet parameter for document-topic" +
-    " distribution) will be optimized during training.")
+    "(For online optimizer only, currently) Indicates whether the docConcentration" +
+    " (Dirichlet parameter for document-topic distribution) will be optimized during training.")
 
   /** @group expertGetParam */
   @Since("1.6.0")
   def getOptimizeDocConcentration: Boolean = $(optimizeDocConcentration)
 
   /**
-   * For EM optimizer, if using checkpointing, this indicates whether to keep the last
+   * For EM optimizer only: [[optimizer]] = "em".
+   *
+   * If using checkpointing, this indicates whether to keep the last
    * checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can
    * cause failures if a data partition is lost, so set this bit with care.
    * Note that checkpoints will be cleaned up via reference counting, regardless.
@@ -279,7 +291,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
    */
   @Since("2.0.0")
   final val keepLastCheckpoint = new BooleanParam(this, "keepLastCheckpoint",
-    "For EM optimizer, if using checkpointing, this indicates whether to keep the last" +
+    "(For EM optimizer) If using checkpointing, this indicates whether to keep the last" +
     " checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can" +
     " cause failures if a data partition is lost, so set this bit with care.")
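Taken together, the LDA notes say that `learningOffset`, `learningDecay`, `subsamplingRate`, and (currently) `optimizeDocConcentration` only take effect when `optimizer` is "online", while `keepLastCheckpoint` applies to "em". A sketch under those assumptions; `dataset` is an assumed DataFrame of term-count vectors:

```scala
import org.apache.spark.ml.clustering.LDA

val lda = new LDA()
  .setK(10)
  .setOptimizer("online")     // required for the three parameters below to apply
  .setLearningOffset(1024.0)  // tau0: larger values downweight early iterations more
  .setLearningDecay(0.51)     // kappa: must lie in (0.5, 1.0] for asymptotic convergence
  .setSubsamplingRate(0.05)   // minibatch fraction sampled per iteration
val model = lda.fit(dataset)  // `dataset` is an assumed DataFrame with a "features" column
```

With `setOptimizer("em")`, the three online parameters above are ignored and `setKeepLastCheckpoint` becomes relevant instead.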
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index cef7c643d7..8eb2984f7b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -41,6 +41,18 @@ import org.apache.spark.sql.functions._
  * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
  * learning algorithm for regression.
  * It supports both continuous and categorical features.
+ *
+ * The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
+ *
+ * Notes on Gradient Boosting vs. TreeBoost:
+ *  - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
+ *  - Both algorithms learn tree ensembles by minimizing loss functions.
+ *  - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
+ *    based on the loss function, whereas the original gradient boosting method does not.
+ *  - When the loss is SquaredError, these methods give the same result, but they could differ
+ *    for other loss functions.
+ *  - We expect to implement TreeBoost in the future:
+ *    [https://issues.apache.org/jira/browse/SPARK-4240]
  */
 @Since("1.4.0")
 @Experimental
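As the new doc notes, with squared error the TreeBoost leaf-modification step changes nothing, so this implementation and TreeBoost coincide for that loss. A minimal regression sketch; `training` is again an assumed DataFrame, not part of the patch:

```scala
import org.apache.spark.ml.regression.GBTRegressor

val gbt = new GBTRegressor()
  .setMaxIter(20)
  .setLossType("squared")  // for squared error, gradient boosting and TreeBoost agree
val model = gbt.fit(training)
```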
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index a40d3731cb..05bf64591b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -165,7 +165,11 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
   setDefault(tol -> 1E-6)
 
   /**
-   * Sets the regularization parameter.
+   * Sets the regularization parameter for L2 regularization.
+   * The regularization term is
+   * {{{
+   *   0.5 * regParam * L2norm(coefficients)^2
+   * }}}
    * Default is 0.0.
    * @group setParam
    */
@@ -772,7 +776,7 @@ object GeneralizedLinearRegressionModel extends MLReadable[GeneralizedLinearRegr
  * :: Experimental ::
  * Summarizing Generalized Linear regression Fits.
  *
- * @param predictions predictions outputted by the model's `transform` method
+ * @param predictions predictions output by the model's `transform` method
 * @param predictionCol field in "predictions" which gives the prediction value of each instance
 * @param model the model that should be summarized
 * @param diagInvAtWA diagonal of matrix (A^T * W * A)^-1 in the last iteration
@@ -933,6 +937,9 @@ class GeneralizedLinearRegressionSummary private[regression] (
 
   /**
    * Standard error of estimated coefficients and intercept.
+   *
+   * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+   * then the last element returned corresponds to the intercept.
    */
   @Since("2.0.0")
   lazy val coefficientStandardErrors: Array[Double] = {
@@ -941,6 +948,9 @@ class GeneralizedLinearRegressionSummary private[regression] (
 
   /**
    * T-statistic of estimated coefficients and intercept.
+   *
+   * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+   * then the last element returned corresponds to the intercept.
    */
   @Since("2.0.0")
   lazy val tValues: Array[Double] = {
@@ -954,6 +964,9 @@ class GeneralizedLinearRegressionSummary private[regression] (
 
   /**
    * Two-sided p-value of estimated coefficients and intercept.
+   *
+   * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+   * then the last element returned corresponds to the intercept.
   */
  @Since("2.0.0")
  lazy val pValues: Array[Double] = {
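So for GLMs, `regParam` controls only an L2 penalty of `0.5 * regParam * ||coefficients||^2`; there is no L1 or elastic-net mixing on this estimator. A sketch of setting it and reading the newly documented intercept-last statistics, assuming an illustrative DataFrame `training`:

```scala
import org.apache.spark.ml.regression.GeneralizedLinearRegression

val glr = new GeneralizedLinearRegression()
  .setFamily("gaussian")
  .setLink("identity")
  .setRegParam(0.3)  // adds 0.5 * 0.3 * ||coefficients||^2 to the loss (L2 only)
val model = glr.fit(training)  // `training` is an assumed DataFrame
// With fitIntercept (the default), the intercept's statistics come last:
val interceptStdErr = model.summary.coefficientStandardErrors.last
```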
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 9619e72a45..aacff4ea47 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -513,7 +513,7 @@ object LinearRegressionModel extends MLReadable[LinearRegressionModel] {
  * Linear regression training results. Currently, the training summary ignores the
  * training weights except for the objective trace.
  *
- * @param predictions predictions outputted by the model's `transform` method.
+ * @param predictions predictions output by the model's `transform` method.
  * @param objectiveHistory objective function (scaled loss + regularization) at each iteration.
  */
 @Since("1.5.0")
@@ -549,7 +549,7 @@ class LinearRegressionTrainingSummary private[regression] (
  * :: Experimental ::
  * Linear regression results evaluated on a dataset.
  *
- * @param predictions predictions outputted by the model's `transform` method.
+ * @param predictions predictions output by the model's `transform` method.
  * @param predictionCol Field in "predictions" which gives the predicted value of the label at
  *                      each instance.
  * @param labelCol Field in "predictions" which gives the true label of each instance.
@@ -655,8 +655,11 @@ class LinearRegressionSummary private[regression] (
 
   /**
    * Standard error of estimated coefficients and intercept.
-   *
    * This value is only available when using the "normal" solver.
+   *
+   * If [[LinearRegression.fitIntercept]] is set to true,
+   * then the last element returned corresponds to the intercept.
+   *
    * @see [[LinearRegression.solver]]
    */
   lazy val coefficientStandardErrors: Array[Double] = {
@@ -679,8 +682,11 @@ class LinearRegressionSummary private[regression] (
 
   /**
    * T-statistic of estimated coefficients and intercept.
-   *
    * This value is only available when using the "normal" solver.
+   *
+   * If [[LinearRegression.fitIntercept]] is set to true,
+   * then the last element returned corresponds to the intercept.
+   *
    * @see [[LinearRegression.solver]]
    */
   lazy val tValues: Array[Double] = {
@@ -699,8 +705,11 @@ class LinearRegressionSummary private[regression] (
 
   /**
    * Two-sided p-value of estimated coefficients and intercept.
-   *
    * This value is only available when using the "normal" solver.
+   *
+   * If [[LinearRegression.fitIntercept]] is set to true,
+   * then the last element returned corresponds to the intercept.
+   *
    * @see [[LinearRegression.solver]]
    */
   lazy val pValues: Array[Double] = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
index d365655674..b6334762c7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
@@ -30,22 +30,6 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
-/**
- * A package that implements
- * [[http://en.wikipedia.org/wiki/Gradient_boosting Stochastic Gradient Boosting]]
- * for regression and binary classification.
- *
- * The implementation is based upon:
- * J.H. Friedman. "Stochastic Gradient Boosting." 1999.
- *
- * Notes on Gradient Boosting vs. TreeBoost:
- *  - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
- *  - Both algorithms learn tree ensembles by minimizing loss functions.
- *  - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
- *    based on the loss function, whereas the original gradient boosting method does not.
- *  - When the loss is SquaredError, these methods give the same result, but they could differ
- *    for other loss functions.
- */
 private[spark] object GradientBoostedTrees extends Logging {
 
   /**
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index d98919b3c6..e64c7a392b 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -291,7 +291,7 @@ class LogisticRegressionSummary(JavaCallable):
     @since("2.0.0")
     def probabilityCol(self):
         """
-        Field in "predictions" which gives the calibrated probability
+        Field in "predictions" which gives the probability
         of each class as a vector.
         """
         return self._call_java("probabilityCol")
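On both the Scala and Python sides, these linear-regression summary statistics exist only for the "normal" solver, and with an intercept fitted its entries come last. A sketch in Scala; `training` is an assumed DataFrame:

```scala
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
  .setSolver("normal")    // std errors, t-values and p-values need the normal solver
  .setFitIntercept(true)
val model = lr.fit(training)  // `training` is an assumed DataFrame
val s = model.summary
// With fitIntercept == true, the intercept's statistics are the last elements:
val interceptStdErr = s.coefficientStandardErrors.last
val interceptT = s.tValues.last
val interceptP = s.pValues.last
```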
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index f6c5d130dd..1c18df3b27 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -331,6 +331,9 @@ class LinearRegressionSummary(JavaCallable):
         Standard error of estimated coefficients and intercept.
         This value is only available when using the "normal" solver.
 
+        If :py:attr:`LinearRegression.fitIntercept` is set to True,
+        then the last element returned corresponds to the intercept.
+
         .. seealso:: :py:attr:`LinearRegression.solver`
         """
         return self._call_java("coefficientStandardErrors")
@@ -342,6 +345,9 @@ class LinearRegressionSummary(JavaCallable):
         T-statistic of estimated coefficients and intercept.
         This value is only available when using the "normal" solver.
 
+        If :py:attr:`LinearRegression.fitIntercept` is set to True,
+        then the last element returned corresponds to the intercept.
+
         .. seealso:: :py:attr:`LinearRegression.solver`
         """
         return self._call_java("tValues")
@@ -353,6 +359,9 @@ class LinearRegressionSummary(JavaCallable):
         Two-sided p-value of estimated coefficients and intercept.
         This value is only available when using the "normal" solver.
 
+        If :py:attr:`LinearRegression.fitIntercept` is set to True,
+        then the last element returned corresponds to the intercept.
+
         .. seealso:: :py:attr:`LinearRegression.solver`
         """
         return self._call_java("pValues")
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index da00f317b3..ea8c61b7ef 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -588,6 +588,8 @@ class TrainValidationSplit(Estimator, ValidatorParams, MLReadable, MLWritable):
 class TrainValidationSplitModel(Model, ValidatorParams, MLReadable, MLWritable):
     """
     Model from train validation split.
+
+    .. versionadded:: 2.0.0
     """
 
     def __init__(self, bestModel):
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index a2cf2296fb..bbeb6cfe6f 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -249,7 +249,7 @@ class JavaModel(Model, JavaCallable, JavaTransformer):
         """
         Initialize this instance with a Java model object.
         Subclasses should call this constructor, initialize params,
-        and then call _transformer_params_from_java.
+        and then call _transfer_params_from_java.
 
         This instance can be instantiated without specifying java_model,
         it will be assigned after that, but this scenario only used by
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5bc0034cb0..223122300d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -154,6 +154,8 @@ object functions {
   /**
    * Aggregate function: returns the approximate number of distinct items in a group.
    *
+   * @param rsd maximum estimation error allowed (default = 0.05)
+   *
    * @group agg_funcs
    * @since 1.3.0
    */
@@ -164,6 +166,8 @@ object functions {
   /**
    * Aggregate function: returns the approximate number of distinct items in a group.
    *
+   * @param rsd maximum estimation error allowed (default = 0.05)
+   *
    * @group agg_funcs
    * @since 1.3.0
    */
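The newly documented `rsd` argument bounds the relative standard deviation of the approximate count: smaller values buy accuracy at the cost of more memory. A usage sketch; the DataFrame `df` and its "user_id" column are assumptions for illustration:

```scala
import org.apache.spark.sql.functions.{approxCountDistinct, col}

// rsd = 0.01 requests roughly 1% relative error, tighter than the 0.05 default
// and correspondingly more memory-hungry. `df` is an assumed DataFrame.
val distinctUsers = df.select(approxCountDistinct(col("user_id"), 0.01))
```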