aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Joseph K. Bradley <joseph@databricks.com> 2016-04-08 20:15:44 -0700
committer: DB Tsai <dbt@netflix.com> 2016-04-08 20:15:44 -0700
commit: d7af736b2cf6c392b87e7b45c2d2219ef06979eb (patch)
tree: 1143b76a2757b75e3ce671e65f68ce8c853fc5d0
parent: 813e96e6faee44079eb52acbdc6c8aa58fb8d191 (diff)
downloadspark-d7af736b2cf6c392b87e7b45c2d2219ef06979eb.tar.gz
spark-d7af736b2cf6c392b87e7b45c2d2219ef06979eb.tar.bz2
spark-d7af736b2cf6c392b87e7b45c2d2219ef06979eb.zip
[SPARK-14498][ML][PYTHON][SQL] Many cleanups to ML and ML-related docs
## What changes were proposed in this pull request?

Cleanups to documentation. No changes to code.
* GBT docs: Move Scala doc for private object GradientBoostedTrees to public docs for GBTClassifier and GBTRegressor
* GLM regParam: needs doc saying it is for L2 only
* TrainValidationSplitModel: add `.. versionadded:: 2.0.0`
* Rename “_transformer_params_from_java” to “_transfer_params_from_java”
* LogReg Summary classes: “probability” col should not say “calibrated”
* LR summaries: coefficientStandardErrors -> document that intercept stderr comes last. Same for t- and p-values
* approxCountDistinct: Document meaning of “rsd” argument.
* LDA: note which params are for online LDA only

## How was this patch tested?

Doc build

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #12266 from jkbradley/ml-doc-cleanups.
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala10
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala12
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala34
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala12
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala17
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala19
-rw-r--r--mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala16
-rw-r--r--python/pyspark/ml/classification.py2
-rw-r--r--python/pyspark/ml/regression.py9
-rw-r--r--python/pyspark/ml/tuning.py2
-rw-r--r--python/pyspark/ml/wrapper.py2
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/functions.scala4
12 files changed, 97 insertions, 42 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index bee90fb3a5..a2150fbcc3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -42,6 +42,16 @@ import org.apache.spark.sql.functions._
* learning algorithm for classification.
* It supports binary labels, as well as both continuous and categorical features.
* Note: Multiclass labels are not currently supported.
+ *
+ * The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
+ *
+ * Notes on Gradient Boosting vs. TreeBoost:
+ * - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
+ * - Both algorithms learn tree ensembles by minimizing loss functions.
+ * - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
+ * based on the loss function, whereas the original gradient boosting method does not.
+ * - We expect to implement TreeBoost in the future:
+ * [https://issues.apache.org/jira/browse/SPARK-4240]
*/
@Since("1.4.0")
@Experimental
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 37182928cc..268c3e32c3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -774,10 +774,10 @@ sealed trait LogisticRegressionTrainingSummary extends LogisticRegressionSummary
*/
sealed trait LogisticRegressionSummary extends Serializable {
- /** Dataframe outputted by the model's `transform` method. */
+ /** Dataframe output by the model's `transform` method. */
def predictions: DataFrame
- /** Field in "predictions" which gives the calibrated probability of each class as a vector. */
+ /** Field in "predictions" which gives the probability of each class as a vector. */
def probabilityCol: String
/** Field in "predictions" which gives the true label of each instance (if available). */
@@ -792,8 +792,8 @@ sealed trait LogisticRegressionSummary extends Serializable {
* :: Experimental ::
* Logistic regression training results.
*
- * @param predictions dataframe outputted by the model's `transform` method.
- * @param probabilityCol field in "predictions" which gives the calibrated probability of
+ * @param predictions dataframe output by the model's `transform` method.
+ * @param probabilityCol field in "predictions" which gives the probability of
* each class as a vector.
* @param labelCol field in "predictions" which gives the true label of each instance.
* @param featuresCol field in "predictions" which gives the features of each instance as a vector.
@@ -816,8 +816,8 @@ class BinaryLogisticRegressionTrainingSummary private[classification] (
* :: Experimental ::
* Binary Logistic regression results for a given model.
*
- * @param predictions dataframe outputted by the model's `transform` method.
- * @param probabilityCol field in "predictions" which gives the calibrated probability of
+ * @param predictions dataframe output by the model's `transform` method.
+ * @param probabilityCol field in "predictions" which gives the probability of
* each class as a vector.
* @param labelCol field in "predictions" which gives the true label of each instance.
* @param featuresCol field in "predictions" which gives the features of each instance as a vector.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 727b724708..89a7a4ccf6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -190,6 +190,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
def getTopicDistributionCol: String = $(topicDistributionCol)
/**
+ * For Online optimizer only: [[optimizer]] = "online".
+ *
* A (positive) learning parameter that downweights early iterations. Larger values make early
* iterations count less.
* This is called "tau0" in the Online LDA paper (Hoffman et al., 2010)
@@ -198,8 +200,9 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* @group expertParam
*/
@Since("1.6.0")
- final val learningOffset = new DoubleParam(this, "learningOffset", "A (positive) learning" +
- " parameter that downweights early iterations. Larger values make early iterations count less.",
+ final val learningOffset = new DoubleParam(this, "learningOffset", "(For online optimizer)" +
+ " A (positive) learning parameter that downweights early iterations. Larger values make early" +
+ " iterations count less.",
ParamValidators.gt(0))
/** @group expertGetParam */
@@ -207,6 +210,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
def getLearningOffset: Double = $(learningOffset)
/**
+ * For Online optimizer only: [[optimizer]] = "online".
+ *
* Learning rate, set as an exponential decay rate.
* This should be between (0.5, 1.0] to guarantee asymptotic convergence.
* This is called "kappa" in the Online LDA paper (Hoffman et al., 2010).
@@ -215,15 +220,17 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* @group expertParam
*/
@Since("1.6.0")
- final val learningDecay = new DoubleParam(this, "learningDecay", "Learning rate, set as an" +
- " exponential decay rate. This should be between (0.5, 1.0] to guarantee asymptotic" +
- " convergence.", ParamValidators.gt(0))
+ final val learningDecay = new DoubleParam(this, "learningDecay", "(For online optimizer)" +
+ " Learning rate, set as an exponential decay rate. This should be between (0.5, 1.0] to" +
+ " guarantee asymptotic convergence.", ParamValidators.gt(0))
/** @group expertGetParam */
@Since("1.6.0")
def getLearningDecay: Double = $(learningDecay)
/**
+ * For Online optimizer only: [[optimizer]] = "online".
+ *
* Fraction of the corpus to be sampled and used in each iteration of mini-batch gradient descent,
* in range (0, 1].
*
@@ -239,8 +246,9 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* @group param
*/
@Since("1.6.0")
- final val subsamplingRate = new DoubleParam(this, "subsamplingRate", "Fraction of the corpus" +
- " to be sampled and used in each iteration of mini-batch gradient descent, in range (0, 1].",
+ final val subsamplingRate = new DoubleParam(this, "subsamplingRate", "(For online optimizer)" +
+ " Fraction of the corpus to be sampled and used in each iteration of mini-batch" +
+ " gradient descent, in range (0, 1].",
ParamValidators.inRange(0.0, 1.0, lowerInclusive = false, upperInclusive = true))
/** @group getParam */
@@ -248,6 +256,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
def getSubsamplingRate: Double = $(subsamplingRate)
/**
+ * For Online optimizer only (currently): [[optimizer]] = "online".
+ *
* Indicates whether the docConcentration (Dirichlet parameter for
* document-topic distribution) will be optimized during training.
* Setting this to true will make the model more expressive and fit the training data better.
@@ -257,15 +267,17 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
*/
@Since("1.6.0")
final val optimizeDocConcentration = new BooleanParam(this, "optimizeDocConcentration",
- "Indicates whether the docConcentration (Dirichlet parameter for document-topic" +
- " distribution) will be optimized during training.")
+ "(For online optimizer only, currently) Indicates whether the docConcentration" +
+ " (Dirichlet parameter for document-topic distribution) will be optimized during training.")
/** @group expertGetParam */
@Since("1.6.0")
def getOptimizeDocConcentration: Boolean = $(optimizeDocConcentration)
/**
- * For EM optimizer, if using checkpointing, this indicates whether to keep the last
+ * For EM optimizer only: [[optimizer]] = "em".
+ *
+ * If using checkpointing, this indicates whether to keep the last
* checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can
* cause failures if a data partition is lost, so set this bit with care.
* Note that checkpoints will be cleaned up via reference counting, regardless.
@@ -279,7 +291,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
*/
@Since("2.0.0")
final val keepLastCheckpoint = new BooleanParam(this, "keepLastCheckpoint",
- "For EM optimizer, if using checkpointing, this indicates whether to keep the last" +
+ "(For EM optimizer) If using checkpointing, this indicates whether to keep the last" +
" checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can" +
" cause failures if a data partition is lost, so set this bit with care.")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index cef7c643d7..8eb2984f7b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -41,6 +41,18 @@ import org.apache.spark.sql.functions._
* [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
* learning algorithm for regression.
* It supports both continuous and categorical features.
+ *
+ * The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
+ *
+ * Notes on Gradient Boosting vs. TreeBoost:
+ * - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
+ * - Both algorithms learn tree ensembles by minimizing loss functions.
+ * - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
+ * based on the loss function, whereas the original gradient boosting method does not.
+ * - When the loss is SquaredError, these methods give the same result, but they could differ
+ * for other loss functions.
+ * - We expect to implement TreeBoost in the future:
+ * [https://issues.apache.org/jira/browse/SPARK-4240]
*/
@Since("1.4.0")
@Experimental
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index a40d3731cb..05bf64591b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -165,7 +165,11 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
setDefault(tol -> 1E-6)
/**
- * Sets the regularization parameter.
+ * Sets the regularization parameter for L2 regularization.
+ * The regularization term is
+ * {{{
+ * 0.5 * regParam * L2norm(coefficients)^2
+ * }}}
* Default is 0.0.
* @group setParam
*/
@@ -772,7 +776,7 @@ object GeneralizedLinearRegressionModel extends MLReadable[GeneralizedLinearRegr
* :: Experimental ::
* Summarizing Generalized Linear regression Fits.
*
- * @param predictions predictions outputted by the model's `transform` method
+ * @param predictions predictions output by the model's `transform` method
* @param predictionCol field in "predictions" which gives the prediction value of each instance
* @param model the model that should be summarized
* @param diagInvAtWA diagonal of matrix (A^T * W * A)^-1 in the last iteration
@@ -933,6 +937,9 @@ class GeneralizedLinearRegressionSummary private[regression] (
/**
* Standard error of estimated coefficients and intercept.
+ *
+ * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+ * then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val coefficientStandardErrors: Array[Double] = {
@@ -941,6 +948,9 @@ class GeneralizedLinearRegressionSummary private[regression] (
/**
* T-statistic of estimated coefficients and intercept.
+ *
+ * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+ * then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val tValues: Array[Double] = {
@@ -954,6 +964,9 @@ class GeneralizedLinearRegressionSummary private[regression] (
/**
* Two-sided p-value of estimated coefficients and intercept.
+ *
+ * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
+ * then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val pValues: Array[Double] = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 9619e72a45..aacff4ea47 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -513,7 +513,7 @@ object LinearRegressionModel extends MLReadable[LinearRegressionModel] {
* Linear regression training results. Currently, the training summary ignores the
* training weights except for the objective trace.
*
- * @param predictions predictions outputted by the model's `transform` method.
+ * @param predictions predictions output by the model's `transform` method.
* @param objectiveHistory objective function (scaled loss + regularization) at each iteration.
*/
@Since("1.5.0")
@@ -549,7 +549,7 @@ class LinearRegressionTrainingSummary private[regression] (
* :: Experimental ::
* Linear regression results evaluated on a dataset.
*
- * @param predictions predictions outputted by the model's `transform` method.
+ * @param predictions predictions output by the model's `transform` method.
* @param predictionCol Field in "predictions" which gives the predicted value of the label at
* each instance.
* @param labelCol Field in "predictions" which gives the true label of each instance.
@@ -655,8 +655,11 @@ class LinearRegressionSummary private[regression] (
/**
* Standard error of estimated coefficients and intercept.
- *
* This value is only available when using the "normal" solver.
+ *
+ * If [[LinearRegression.fitIntercept]] is set to true,
+ * then the last element returned corresponds to the intercept.
+ *
* @see [[LinearRegression.solver]]
*/
lazy val coefficientStandardErrors: Array[Double] = {
@@ -679,8 +682,11 @@ class LinearRegressionSummary private[regression] (
/**
* T-statistic of estimated coefficients and intercept.
- *
* This value is only available when using the "normal" solver.
+ *
+ * If [[LinearRegression.fitIntercept]] is set to true,
+ * then the last element returned corresponds to the intercept.
+ *
* @see [[LinearRegression.solver]]
*/
lazy val tValues: Array[Double] = {
@@ -699,8 +705,11 @@ class LinearRegressionSummary private[regression] (
/**
* Two-sided p-value of estimated coefficients and intercept.
- *
* This value is only available when using the "normal" solver.
+ *
+ * If [[LinearRegression.fitIntercept]] is set to true,
+ * then the last element returned corresponds to the intercept.
+ *
* @see [[LinearRegression.solver]]
*/
lazy val pValues: Array[Double] = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
index d365655674..b6334762c7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
@@ -30,22 +30,6 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
-/**
- * A package that implements
- * [[http://en.wikipedia.org/wiki/Gradient_boosting Stochastic Gradient Boosting]]
- * for regression and binary classification.
- *
- * The implementation is based upon:
- * J.H. Friedman. "Stochastic Gradient Boosting." 1999.
- *
- * Notes on Gradient Boosting vs. TreeBoost:
- * - This implementation is for Stochastic Gradient Boosting, not for TreeBoost.
- * - Both algorithms learn tree ensembles by minimizing loss functions.
- * - TreeBoost (Friedman, 1999) additionally modifies the outputs at tree leaf nodes
- * based on the loss function, whereas the original gradient boosting method does not.
- * - When the loss is SquaredError, these methods give the same result, but they could differ
- * for other loss functions.
- */
private[spark] object GradientBoostedTrees extends Logging {
/**
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index d98919b3c6..e64c7a392b 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -291,7 +291,7 @@ class LogisticRegressionSummary(JavaCallable):
@since("2.0.0")
def probabilityCol(self):
"""
- Field in "predictions" which gives the calibrated probability
+ Field in "predictions" which gives the probability
of each class as a vector.
"""
return self._call_java("probabilityCol")
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index f6c5d130dd..1c18df3b27 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -331,6 +331,9 @@ class LinearRegressionSummary(JavaCallable):
Standard error of estimated coefficients and intercept.
This value is only available when using the "normal" solver.
+ If :py:attr:`LinearRegression.fitIntercept` is set to True,
+ then the last element returned corresponds to the intercept.
+
.. seealso:: :py:attr:`LinearRegression.solver`
"""
return self._call_java("coefficientStandardErrors")
@@ -342,6 +345,9 @@ class LinearRegressionSummary(JavaCallable):
T-statistic of estimated coefficients and intercept.
This value is only available when using the "normal" solver.
+ If :py:attr:`LinearRegression.fitIntercept` is set to True,
+ then the last element returned corresponds to the intercept.
+
.. seealso:: :py:attr:`LinearRegression.solver`
"""
return self._call_java("tValues")
@@ -353,6 +359,9 @@ class LinearRegressionSummary(JavaCallable):
Two-sided p-value of estimated coefficients and intercept.
This value is only available when using the "normal" solver.
+ If :py:attr:`LinearRegression.fitIntercept` is set to True,
+ then the last element returned corresponds to the intercept.
+
.. seealso:: :py:attr:`LinearRegression.solver`
"""
return self._call_java("pValues")
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index da00f317b3..ea8c61b7ef 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -588,6 +588,8 @@ class TrainValidationSplit(Estimator, ValidatorParams, MLReadable, MLWritable):
class TrainValidationSplitModel(Model, ValidatorParams, MLReadable, MLWritable):
"""
Model from train validation split.
+
+ .. versionadded:: 2.0.0
"""
def __init__(self, bestModel):
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index a2cf2296fb..bbeb6cfe6f 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -249,7 +249,7 @@ class JavaModel(Model, JavaCallable, JavaTransformer):
"""
Initialize this instance with a Java model object.
Subclasses should call this constructor, initialize params,
- and then call _transformer_params_from_java.
+ and then call _transfer_params_from_java.
This instance can be instantiated without specifying java_model,
it will be assigned after that, but this scenario only used by
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5bc0034cb0..223122300d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -154,6 +154,8 @@ object functions {
/**
* Aggregate function: returns the approximate number of distinct items in a group.
*
+ * @param rsd maximum relative standard deviation allowed (default = 0.05)
+ *
* @group agg_funcs
* @since 1.3.0
*/
@@ -164,6 +166,8 @@ object functions {
/**
* Aggregate function: returns the approximate number of distinct items in a group.
*
+ * @param rsd maximum relative standard deviation allowed (default = 0.05)
+ *
* @group agg_funcs
* @since 1.3.0
*/