diff options
author | Bryan Cutler <bjcutler@us.ibm.com> | 2015-08-18 14:58:30 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-08-18 14:58:30 -0700 |
commit | 1dbffba37a84c62202befd3911d25888f958191d (patch) | |
tree | 0aa21cdd020476a65d04495305374925f0051d21 | |
parent | 492ac1facbc79ee251d45cff315598ec9935a0e2 (diff) | |
download | spark-1dbffba37a84c62202befd3911d25888f958191d.tar.gz spark-1dbffba37a84c62202befd3911d25888f958191d.tar.bz2 spark-1dbffba37a84c62202befd3911d25888f958191d.zip |
[SPARK-8924] [MLLIB, DOCUMENTATION] Added @since tags to mllib.tree
Added `@since` tags to mllib.tree
Author: Bryan Cutler <bjcutler@us.ibm.com>
Closes #7380 from BryanCutler/sinceTag-mllibTree-8924.
24 files changed, 157 insertions(+), 1 deletion(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index cecd1fed89..e5200b86fd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -43,6 +43,7 @@ import org.apache.spark.util.random.XORShiftRandom * @param strategy The configuration parameters for the tree algorithm which specify the type * of algorithm (classification, regression, etc.), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. + * @since 1.0.0 */ @Experimental class DecisionTree (private val strategy: Strategy) extends Serializable with Logging { @@ -53,6 +54,7 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo * Method to train a decision tree model over an RDD * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] * @return DecisionTreeModel that can be used for prediction + * @since 1.2.0 */ def run(input: RDD[LabeledPoint]): DecisionTreeModel = { // Note: random seed will not be used since numTrees = 1. @@ -62,6 +64,9 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo } } +/** + * @since 1.0.0 + */ object DecisionTree extends Serializable with Logging { /** @@ -79,6 +84,7 @@ object DecisionTree extends Serializable with Logging { * of algorithm (classification, regression, etc.), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. * @return DecisionTreeModel that can be used for prediction + * @since 1.0.0 */ def train(input: RDD[LabeledPoint], strategy: Strategy): DecisionTreeModel = { new DecisionTree(strategy).run(input) @@ -100,6 +106,7 @@ object DecisionTree extends Serializable with Logging { * @param maxDepth Maximum depth of the tree. 
* E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. * @return DecisionTreeModel that can be used for prediction + * @since 1.0.0 */ def train( input: RDD[LabeledPoint], @@ -127,6 +134,7 @@ object DecisionTree extends Serializable with Logging { * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. * @param numClasses number of classes for classification. Default value of 2. * @return DecisionTreeModel that can be used for prediction + * @since 1.2.0 */ def train( input: RDD[LabeledPoint], @@ -160,6 +168,7 @@ object DecisionTree extends Serializable with Logging { * E.g., an entry (n -> k) indicates that feature n is categorical * with k categories indexed from 0: {0, 1, ..., k-1}. * @return DecisionTreeModel that can be used for prediction + * @since 1.0.0 */ def train( input: RDD[LabeledPoint], @@ -192,6 +201,7 @@ object DecisionTree extends Serializable with Logging { * @param maxBins maximum number of bins used for splitting features * (suggested value: 32) * @return DecisionTreeModel that can be used for prediction + * @since 1.1.0 */ def trainClassifier( input: RDD[LabeledPoint], @@ -207,6 +217,7 @@ object DecisionTree extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]] + * @since 1.1.0 */ def trainClassifier( input: JavaRDD[LabeledPoint], @@ -236,6 +247,7 @@ object DecisionTree extends Serializable with Logging { * @param maxBins maximum number of bins used for splitting features * (suggested value: 32) * @return DecisionTreeModel that can be used for prediction + * @since 1.1.0 */ def trainRegressor( input: RDD[LabeledPoint], @@ -249,6 +261,7 @@ object DecisionTree extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]] + * @since 1.1.0 */ def trainRegressor( input: JavaRDD[LabeledPoint], diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index 9ce6faa137..1436170986 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -48,6 +48,7 @@ import org.apache.spark.storage.StorageLevel * for other loss functions. * * @param boostingStrategy Parameters for the gradient boosting algorithm. + * @since 1.2.0 */ @Experimental class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) @@ -57,6 +58,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) * Method to train a gradient boosting model * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * @return a gradient boosted trees model that can be used for prediction + * @since 1.2.0 */ def run(input: RDD[LabeledPoint]): GradientBoostedTreesModel = { val algo = boostingStrategy.treeStrategy.algo @@ -74,6 +76,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#run]]. + * @since 1.2.0 */ def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = { run(input.rdd) @@ -88,6 +91,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) * E.g., these two datasets could be created from an original dataset * by using [[org.apache.spark.rdd.RDD.randomSplit()]] * @return a gradient boosted trees model that can be used for prediction + * @since 1.4.0 */ def runWithValidation( input: RDD[LabeledPoint], @@ -111,6 +115,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation]]. 
+ * @since 1.4.0 */ def runWithValidation( input: JavaRDD[LabeledPoint], @@ -119,6 +124,9 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) } } +/** + * @since 1.2.0 + */ object GradientBoostedTrees extends Logging { /** @@ -129,6 +137,7 @@ object GradientBoostedTrees extends Logging { * For regression, labels are real numbers. * @param boostingStrategy Configuration options for the boosting algorithm. * @return a gradient boosted trees model that can be used for prediction + * @since 1.2.0 */ def train( input: RDD[LabeledPoint], @@ -138,6 +147,7 @@ object GradientBoostedTrees extends Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees$#train]] + * @since 1.2.0 */ def train( input: JavaRDD[LabeledPoint], diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index 069959976a..9f3230656a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -260,6 +260,9 @@ private class RandomForest ( } +/** + * @since 1.2.0 + */ object RandomForest extends Serializable with Logging { /** @@ -276,6 +279,7 @@ object RandomForest extends Serializable with Logging { * if numTrees > 1 (forest) set to "sqrt". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction + * @since 1.2.0 */ def trainClassifier( input: RDD[LabeledPoint], @@ -313,6 +317,7 @@ object RandomForest extends Serializable with Logging { * (suggested value: 100) * @param seed Random seed for bootstrapping and choosing feature subsets. 
* @return a random forest model that can be used for prediction + * @since 1.2.0 */ def trainClassifier( input: RDD[LabeledPoint], @@ -332,6 +337,7 @@ object RandomForest extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.RandomForest$#trainClassifier]] + * @since 1.2.0 */ def trainClassifier( input: JavaRDD[LabeledPoint], @@ -362,6 +368,7 @@ object RandomForest extends Serializable with Logging { * if numTrees > 1 (forest) set to "onethird". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction + * @since 1.2.0 */ def trainRegressor( input: RDD[LabeledPoint], @@ -398,6 +405,7 @@ object RandomForest extends Serializable with Logging { * (suggested value: 100) * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction + * @since 1.2.0 */ def trainRegressor( input: RDD[LabeledPoint], @@ -416,6 +424,7 @@ object RandomForest extends Serializable with Logging { /** * Java-friendly API for [[org.apache.spark.mllib.tree.RandomForest$#trainRegressor]] + * @since 1.2.0 */ def trainRegressor( input: JavaRDD[LabeledPoint], @@ -433,6 +442,7 @@ object RandomForest extends Serializable with Logging { /** * List of supported feature subset sampling strategies. 
+ * @since 1.2.0 */ val supportedFeatureSubsetStrategies: Array[String] = Array("auto", "all", "sqrt", "log2", "onethird") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala index b609925997..d9a49aa71f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala @@ -22,6 +22,7 @@ import org.apache.spark.annotation.Experimental /** * :: Experimental :: * Enum to select the algorithm for the decision tree + * @since 1.0.0 */ @Experimental object Algo extends Enumeration { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 50fe2ac53d..88e5f57e9a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -38,6 +38,7 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} * validation input between two iterations is less than the validationTol * then stop. Ignored when * [[org.apache.spark.mllib.tree.GradientBoostedTrees.run()]] is used. + * @since 1.2.0 */ @Experimental case class BoostingStrategy( @@ -70,6 +71,9 @@ case class BoostingStrategy( } } +/** + * @since 1.2.0 + */ @Experimental object BoostingStrategy { @@ -77,6 +81,7 @@ object BoostingStrategy { * Returns default configuration for the boosting algorithm * @param algo Learning goal. 
Supported: "Classification" or "Regression" * @return Configuration for boosting algorithm + * @since 1.2.0 */ def defaultParams(algo: String): BoostingStrategy = { defaultParams(Algo.fromString(algo)) @@ -88,6 +93,7 @@ object BoostingStrategy { * [[org.apache.spark.mllib.tree.configuration.Algo.Classification]], * [[org.apache.spark.mllib.tree.configuration.Algo.Regression]] * @return Configuration for boosting algorithm + * @since 1.3.0 */ def defaultParams(algo: Algo): BoostingStrategy = { val treeStrategy = Strategy.defaultStrategy(algo) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala index f4c8772327..0684cafa48 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala @@ -22,6 +22,7 @@ import org.apache.spark.annotation.Experimental /** * :: Experimental :: * Enum to describe whether a feature is "continuous" or "categorical" + * @since 1.0.0 */ @Experimental object FeatureType extends Enumeration { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala index 7da976e55a..2daa63c4d2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala @@ -22,6 +22,7 @@ import org.apache.spark.annotation.Experimental /** * :: Experimental :: * Enum for selecting the quantile calculation strategy + * @since 1.0.0 */ @Experimental object QuantileStrategy extends Enumeration { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index 
de2c784809..7ae25a88bf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -66,6 +66,7 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ * E.g. 10 means that the cache will get checkpointed every 10 updates. If * the checkpoint directory is not set in * [[org.apache.spark.SparkContext]], this setting is ignored. + * @since 1.0.0 */ @Experimental class Strategy ( @@ -83,16 +84,23 @@ class Strategy ( @BeanProperty var useNodeIdCache: Boolean = false, @BeanProperty var checkpointInterval: Int = 10) extends Serializable { + /** + * @since 1.2.0 + */ def isMulticlassClassification: Boolean = { algo == Classification && numClasses > 2 } + /** + * @since 1.2.0 + */ def isMulticlassWithCategoricalFeatures: Boolean = { isMulticlassClassification && (categoricalFeaturesInfo.size > 0) } /** * Java-friendly constructor for [[org.apache.spark.mllib.tree.configuration.Strategy]] + * @since 1.1.0 */ def this( algo: Algo, @@ -107,6 +115,7 @@ class Strategy ( /** * Sets Algorithm using a String. + * @since 1.2.0 */ def setAlgo(algo: String): Unit = algo match { case "Classification" => setAlgo(Classification) @@ -115,6 +124,7 @@ class Strategy ( /** * Sets categoricalFeaturesInfo using a Java Map. + * @since 1.2.0 */ def setCategoricalFeaturesInfo( categoricalFeaturesInfo: java.util.Map[java.lang.Integer, java.lang.Integer]): Unit = { @@ -162,7 +172,10 @@ class Strategy ( s"$subsamplingRate") } - /** Returns a shallow copy of this instance. */ + /** + * Returns a shallow copy of this instance. 
+ * @since 1.2.0 + */ def copy: Strategy = { new Strategy(algo, impurity, maxDepth, numClasses, maxBins, quantileCalculationStrategy, categoricalFeaturesInfo, minInstancesPerNode, minInfoGain, @@ -170,12 +183,16 @@ class Strategy ( } } +/** + * @since 1.2.0 + */ @Experimental object Strategy { /** * Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]] * @param algo "Classification" or "Regression" + * @since 1.2.0 */ def defaultStrategy(algo: String): Strategy = { defaultStrategy(Algo.fromString(algo)) @@ -184,6 +201,7 @@ object Strategy { /** * Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]] * @param algo Algo.Classification or Algo.Regression + * @since 1.3.0 */ def defaultStrategy(algo: Algo): Strategy = algo match { case Algo.Classification => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala index 0768204c33..0b6c7266de 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala @@ -23,6 +23,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental} * :: Experimental :: * Class for calculating [[http://en.wikipedia.org/wiki/Binary_entropy_function entropy]] during * binary classification. 
+ * @since 1.0.0 */ @Experimental object Entropy extends Impurity { @@ -35,6 +36,7 @@ object Entropy extends Impurity { * @param counts Array[Double] with counts for each label * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 + * @since 1.1.0 */ @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = { @@ -62,6 +64,7 @@ object Entropy extends Impurity { * @param sum sum of labels * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 + * @since 1.0.0 */ @DeveloperApi override def calculate(count: Double, sum: Double, sumSquares: Double): Double = @@ -70,6 +73,7 @@ object Entropy extends Impurity { /** * Get this impurity instance. * This is useful for passing impurity parameters to a Strategy in Java. + * @since 1.1.0 */ def instance: this.type = this diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala index d0077db683..3b0be42883 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala @@ -24,6 +24,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental} * Class for calculating the * [[http://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity Gini impurity]] * during binary classification. 
+ * @since 1.0.0 */ @Experimental object Gini extends Impurity { @@ -34,6 +35,7 @@ object Gini extends Impurity { * @param counts Array[Double] with counts for each label * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 + * @since 1.1.0 */ @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = { @@ -58,6 +60,7 @@ object Gini extends Impurity { * @param sum sum of labels * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 + * @since 1.0.0 */ @DeveloperApi override def calculate(count: Double, sum: Double, sumSquares: Double): Double = @@ -66,6 +69,7 @@ object Gini extends Impurity { /** * Get this impurity instance. * This is useful for passing impurity parameters to a Strategy in Java. + * @since 1.1.0 */ def instance: this.type = this diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala index 86cee7e430..dd29740005 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala @@ -25,6 +25,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental} * This trait is used for * (a) setting the impurity parameter in [[org.apache.spark.mllib.tree.configuration.Strategy]] * (b) calculating impurity values from sufficient statistics. 
+ * @since 1.0.0 */ @Experimental trait Impurity extends Serializable { @@ -35,6 +36,7 @@ trait Impurity extends Serializable { * @param counts Array[Double] with counts for each label * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 + * @since 1.1.0 */ @DeveloperApi def calculate(counts: Array[Double], totalCount: Double): Double @@ -46,6 +48,7 @@ trait Impurity extends Serializable { * @param sum sum of labels * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 + * @since 1.0.0 */ @DeveloperApi def calculate(count: Double, sum: Double, sumSquares: Double): Double diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala index 04d0cd24e6..adbe05811f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala @@ -22,6 +22,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental} /** * :: Experimental :: * Class for calculating variance during regression + * @since 1.0.0 */ @Experimental object Variance extends Impurity { @@ -32,6 +33,7 @@ object Variance extends Impurity { * @param counts Array[Double] with counts for each label * @param totalCount sum of counts for all labels * @return information value, or 0 if totalCount = 0 + * @since 1.1.0 */ @DeveloperApi override def calculate(counts: Array[Double], totalCount: Double): Double = @@ -44,6 +46,7 @@ object Variance extends Impurity { * @param sum sum of labels * @param sumSquares summation of squares of the labels * @return information value, or 0 if count = 0 + * @since 1.0.0 */ @DeveloperApi override def calculate(count: Double, sum: Double, sumSquares: Double): Double = { @@ -57,6 +60,7 @@ object Variance extends Impurity { /** * Get this impurity instance. 
* This is useful for passing impurity parameters to a Strategy in Java. + * @since 1.0.0 */ def instance: this.type = this diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala index 2bdef73c4a..c6e3d0d824 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala @@ -29,6 +29,7 @@ import org.apache.spark.mllib.tree.model.TreeEnsembleModel * The absolute (L1) error is defined as: * |y - F(x)| * where y is the label and F(x) is the model prediction for features x. + * @since 1.2.0 */ @DeveloperApi object AbsoluteError extends Loss { @@ -40,6 +41,7 @@ object AbsoluteError extends Loss { * @param prediction Predicted label. * @param label True label. * @return Loss gradient + * @since 1.2.0 */ override def gradient(prediction: Double, label: Double): Double = { if (label - prediction < 0) 1.0 else -1.0 diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala index 778c24526d..eee58445a1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala @@ -31,6 +31,7 @@ import org.apache.spark.mllib.util.MLUtils * The log loss is defined as: * 2 log(1 + exp(-2 y F(x))) * where y is a label in {-1, 1} and F(x) is the model prediction for features x. + * @since 1.2.0 */ @DeveloperApi object LogLoss extends Loss { @@ -42,6 +43,7 @@ object LogLoss extends Loss { * @param prediction Predicted label. * @param label True label. 
* @return Loss gradient + * @since 1.2.0 */ override def gradient(prediction: Double, label: Double): Double = { - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala index 64ffccbce0..7c9fb92464 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala @@ -26,6 +26,7 @@ import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * Trait for adding "pluggable" loss functions for the gradient boosting algorithm. + * @since 1.2.0 */ @DeveloperApi trait Loss extends Serializable { @@ -35,6 +36,7 @@ trait Loss extends Serializable { * @param prediction Predicted feature * @param label true label. * @return Loss gradient. + * @since 1.2.0 */ def gradient(prediction: Double, label: Double): Double @@ -45,6 +47,7 @@ trait Loss extends Serializable { * @param model Model of the weak learner. * @param data Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. 
* @return Measure of model error on data + * @since 1.2.0 */ def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double = { data.map(point => computeError(model.predict(point.features), point.label)).mean() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala index 42c9ead988..47dc94cde7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala @@ -17,8 +17,14 @@ package org.apache.spark.mllib.tree.loss +/** + * @since 1.2.0 + */ object Losses { + /** + * @since 1.2.0 + */ def fromString(name: String): Loss = name match { case "leastSquaresError" => SquaredError case "leastAbsoluteError" => AbsoluteError diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala index 011a5d5742..ff8903d695 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala @@ -29,6 +29,7 @@ import org.apache.spark.mllib.tree.model.TreeEnsembleModel * The squared (L2) error is defined as: * (y - F(x))**2 * where y is the label and F(x) is the model prediction for features x. + * @since 1.2.0 */ @DeveloperApi object SquaredError extends Loss { @@ -40,6 +41,7 @@ object SquaredError extends Loss { * @param prediction Predicted label. * @param label True label. 
* @return Loss gradient + * @since 1.2.0 */ override def gradient(prediction: Double, label: Double): Double = { - 2.0 * (label - prediction) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala index f2c78bbabf..0f386a2660 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala @@ -40,6 +40,7 @@ import org.apache.spark.util.Utils * This model stores the decision tree structure and parameters. * @param topNode root node * @param algo algorithm type -- classification or regression + * @since 1.0.0 */ @Experimental class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable with Saveable { @@ -49,6 +50,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * * @param features array representing a single data point * @return Double prediction from the trained model + * @since 1.0.0 */ def predict(features: Vector): Double = { topNode.predict(features) @@ -59,6 +61,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * * @param features RDD representing data points to be predicted * @return RDD of predictions for each of the given data points + * @since 1.0.0 */ def predict(features: RDD[Vector]): RDD[Double] = { features.map(x => predict(x)) @@ -69,6 +72,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable * * @param features JavaRDD representing data points to be predicted * @return JavaRDD of predictions for each of the given data points + * @since 1.2.0 */ def predict(features: JavaRDD[Vector]): JavaRDD[Double] = { predict(features.rdd) @@ -76,6 +80,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable /** * Get number of nodes in tree, including leaf nodes. 
+ * @since 1.1.0 */ def numNodes: Int = { 1 + topNode.numDescendants @@ -84,6 +89,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable /** * Get depth of tree. * E.g.: Depth 0 means 1 leaf node. Depth 1 means 1 internal node and 2 leaf nodes. + * @since 1.1.0 */ def depth: Int = { topNode.subtreeDepth @@ -109,6 +115,12 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable header + topNode.subtreeToString(2) } + /** + * @param sc Spark context used to save model data. + * @param path Path specifying the directory in which to save this model. + * If the directory already exists, this method throws an exception. + * @since 1.3.0 + */ override def save(sc: SparkContext, path: String): Unit = { DecisionTreeModel.SaveLoadV1_0.save(sc, path, this) } @@ -116,6 +128,9 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable override protected def formatVersion: String = DecisionTreeModel.formatVersion } +/** + * @since 1.3.0 + */ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging { private[spark] def formatVersion: String = "1.0" @@ -297,6 +312,13 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging { } } + /** + * + * @param sc Spark context used for loading model files. + * @param path Path specifying the directory to which the model was saved. 
+ * @return Model instance + * @since 1.3.0 + */ override def load(sc: SparkContext, path: String): DecisionTreeModel = { implicit val formats = DefaultFormats val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala index 508bf9c1bd..23f0363639 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala @@ -29,6 +29,7 @@ import org.apache.spark.mllib.tree.impurity.ImpurityCalculator * @param rightImpurity right node impurity * @param leftPredict left node predict * @param rightPredict right node predict + * @since 1.0.0 */ @DeveloperApi class InformationGainStats( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala index a6d1398fc2..aca3350c2e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala @@ -38,6 +38,7 @@ import org.apache.spark.mllib.linalg.Vector * @param leftNode left child * @param rightNode right child * @param stats information gain stats + * @since 1.0.0 */ @DeveloperApi class Node ( @@ -58,6 +59,7 @@ class Node ( /** * build the left node and right nodes if not leaf * @param nodes array of nodes + * @since 1.0.0 */ @deprecated("build should no longer be used since trees are constructed on-the-fly in training", "1.2.0") @@ -79,6 +81,7 @@ class Node ( * predict value if node is not leaf * @param features feature value * @return predicted value + * @since 1.1.0 */ def predict(features: Vector) : Double = { if (isLeaf) { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala index 5cbe7c280d..be819b59e7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala @@ -23,6 +23,7 @@ import org.apache.spark.annotation.DeveloperApi * Predicted value for a node * @param predict predicted value * @param prob probability of the label (classification only) + * @since 1.2.0 */ @DeveloperApi class Predict( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala index be6c9b3de5..18d40530ae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala @@ -30,6 +30,7 @@ import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType * Split left if feature <= threshold, else right. * @param featureType type of feature -- categorical or continuous * @param categories Split left if categorical feature value is in this set, else right. 
+ * @since 1.0.0 */ @DeveloperApi case class Split( diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala index 905c5fb42b..0c629b12a8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala @@ -45,6 +45,7 @@ import org.apache.spark.util.Utils * * @param algo algorithm for the ensemble model, either Classification or Regression * @param trees tree ensembles + * @since 1.2.0 */ @Experimental class RandomForestModel(override val algo: Algo, override val trees: Array[DecisionTreeModel]) @@ -54,6 +55,13 @@ class RandomForestModel(override val algo: Algo, override val trees: Array[Decis require(trees.forall(_.algo == algo)) + /** + * + * @param sc Spark context used to save model data. + * @param path Path specifying the directory in which to save this model. + * If the directory already exists, this method throws an exception. + * @since 1.3.0 + */ override def save(sc: SparkContext, path: String): Unit = { TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this, RandomForestModel.SaveLoadV1_0.thisClassName) @@ -62,10 +70,20 @@ class RandomForestModel(override val algo: Algo, override val trees: Array[Decis override protected def formatVersion: String = RandomForestModel.formatVersion } +/** + * @since 1.3.0 + */ object RandomForestModel extends Loader[RandomForestModel] { private[mllib] def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion + /** + * + * @param sc Spark context used for loading model files. + * @param path Path specifying the directory to which the model was saved. 
+ * @return Model instance + * @since 1.3.0 + */ override def load(sc: SparkContext, path: String): RandomForestModel = { val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName @@ -96,6 +114,7 @@ object RandomForestModel extends Loader[RandomForestModel] { * @param algo algorithm for the ensemble model, either Classification or Regression * @param trees tree ensembles * @param treeWeights tree ensemble weights + * @since 1.2.0 */ @Experimental class GradientBoostedTreesModel( @@ -107,6 +126,12 @@ class GradientBoostedTreesModel( require(trees.length == treeWeights.length) + /** + * @param sc Spark context used to save model data. + * @param path Path specifying the directory in which to save this model. + * If the directory already exists, this method throws an exception. + * @since 1.3.0 + */ override def save(sc: SparkContext, path: String): Unit = { TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this, GradientBoostedTreesModel.SaveLoadV1_0.thisClassName) @@ -118,6 +143,7 @@ class GradientBoostedTreesModel( * @param loss evaluation metric. * @return an array with index i having the losses or errors for the ensemble * containing the first i+1 trees + * @since 1.4.0 */ def evaluateEachIteration( data: RDD[LabeledPoint], @@ -159,6 +185,9 @@ class GradientBoostedTreesModel( override protected def formatVersion: String = GradientBoostedTreesModel.formatVersion } +/** + * @since 1.3.0 + */ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { /** @@ -170,6 +199,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @param loss: evaluation metric. * @return a RDD with each element being a zip of the prediction and error * corresponding to every sample. 
+ * @since 1.4.0 */ def computeInitialPredictionAndError( data: RDD[LabeledPoint], @@ -193,6 +223,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { * @param loss: evaluation metric. * @return a RDD with each element being a zip of the prediction and error * corresponding to each sample. + * @since 1.4.0 */ def updatePredictionError( data: RDD[LabeledPoint], @@ -213,6 +244,12 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] { private[mllib] def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion + /** + * @param sc Spark context used for loading model files. + * @param path Path specifying the directory to which the model was saved. + * @return Model instance + * @since 1.3.0 + */ override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = { val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala index bcaacc1b1f..f520b3a1b7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala @@ -24,6 +24,7 @@ package org.apache.spark.mllib * - information loss calculation with entropy and Gini for classification and * variance for regression, * - both continuous and categorical features. + * @since 1.0.0 */ package object tree { } |