author    Bryan Cutler <bjcutler@us.ibm.com>   2015-08-18 14:58:30 -0700
committer Xiangrui Meng <meng@databricks.com>  2015-08-18 14:58:30 -0700
commit    1dbffba37a84c62202befd3911d25888f958191d
tree      0aa21cdd020476a65d04495305374925f0051d21
parent    492ac1facbc79ee251d45cff315598ec9935a0e2
[SPARK-8924] [MLLIB, DOCUMENTATION] Added @since tags to mllib.tree
Added @since tags to mllib.tree

Author: Bryan Cutler <bjcutler@us.ibm.com>

Closes #7380 from BryanCutler/sinceTag-mllibTree-8924.
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala                   | 13
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala           | 10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala                   | 10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala             |  1
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala |  6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala      |  1
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala |  1
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala         | 20
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala               |  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala                  |  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala              |  3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala              |  4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala             |  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala                   |  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala                      |  3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala                    |  6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala              |  2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala        | 22
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala     |  1
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala                     |  3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala                  |  1
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala                    |  1
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala       | 37
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala                        |  1
24 files changed, 157 insertions(+), 1 deletion(-)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index cecd1fed89..e5200b86fd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -43,6 +43,7 @@ import org.apache.spark.util.random.XORShiftRandom
* @param strategy The configuration parameters for the tree algorithm which specify the type
* of algorithm (classification, regression, etc.), feature type (continuous,
* categorical), depth of the tree, quantile calculation strategy, etc.
+ * @since 1.0.0
*/
@Experimental
class DecisionTree (private val strategy: Strategy) extends Serializable with Logging {
@@ -53,6 +54,7 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
* Method to train a decision tree model over an RDD
* @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
* @return DecisionTreeModel that can be used for prediction
+ * @since 1.2.0
*/
def run(input: RDD[LabeledPoint]): DecisionTreeModel = {
// Note: random seed will not be used since numTrees = 1.
@@ -62,6 +64,9 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
}
}
+/**
+ * @since 1.0.0
+ */
object DecisionTree extends Serializable with Logging {
/**
@@ -79,6 +84,7 @@ object DecisionTree extends Serializable with Logging {
* of algorithm (classification, regression, etc.), feature type (continuous,
* categorical), depth of the tree, quantile calculation strategy, etc.
* @return DecisionTreeModel that can be used for prediction
+ * @since 1.0.0
*/
def train(input: RDD[LabeledPoint], strategy: Strategy): DecisionTreeModel = {
new DecisionTree(strategy).run(input)
@@ -100,6 +106,7 @@ object DecisionTree extends Serializable with Logging {
* @param maxDepth Maximum depth of the tree.
* E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
* @return DecisionTreeModel that can be used for prediction
+ * @since 1.0.0
*/
def train(
input: RDD[LabeledPoint],
@@ -127,6 +134,7 @@ object DecisionTree extends Serializable with Logging {
* E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
* @param numClasses number of classes for classification. Default value of 2.
* @return DecisionTreeModel that can be used for prediction
+ * @since 1.2.0
*/
def train(
input: RDD[LabeledPoint],
@@ -160,6 +168,7 @@ object DecisionTree extends Serializable with Logging {
* E.g., an entry (n -> k) indicates that feature n is categorical
* with k categories indexed from 0: {0, 1, ..., k-1}.
* @return DecisionTreeModel that can be used for prediction
+ * @since 1.0.0
*/
def train(
input: RDD[LabeledPoint],
@@ -192,6 +201,7 @@ object DecisionTree extends Serializable with Logging {
* @param maxBins maximum number of bins used for splitting features
* (suggested value: 32)
* @return DecisionTreeModel that can be used for prediction
+ * @since 1.1.0
*/
def trainClassifier(
input: RDD[LabeledPoint],
@@ -207,6 +217,7 @@ object DecisionTree extends Serializable with Logging {
/**
* Java-friendly API for [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
+ * @since 1.1.0
*/
def trainClassifier(
input: JavaRDD[LabeledPoint],
@@ -236,6 +247,7 @@ object DecisionTree extends Serializable with Logging {
* @param maxBins maximum number of bins used for splitting features
* (suggested value: 32)
* @return DecisionTreeModel that can be used for prediction
+ * @since 1.1.0
*/
def trainRegressor(
input: RDD[LabeledPoint],
@@ -249,6 +261,7 @@ object DecisionTree extends Serializable with Logging {
/**
* Java-friendly API for [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
+ * @since 1.1.0
*/
def trainRegressor(
input: JavaRDD[LabeledPoint],
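
[Editor's note: not part of the patch — a minimal usage sketch of the trainClassifier API tagged above, assuming an existing SparkContext `sc` and a hypothetical toy dataset.]

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.tree.DecisionTree

    // Toy binary-classification data; any RDD[LabeledPoint] works here.
    val data = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(0.0, 1.0)),
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0))))

    // All features are continuous, so categoricalFeaturesInfo stays empty.
    val model = DecisionTree.trainClassifier(data, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), impurity = "gini",
      maxDepth = 5, maxBins = 32)
    model.predict(Vectors.dense(1.0, 0.0))  // returns a Double label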
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index 9ce6faa137..1436170986 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -48,6 +48,7 @@ import org.apache.spark.storage.StorageLevel
* for other loss functions.
*
* @param boostingStrategy Parameters for the gradient boosting algorithm.
+ * @since 1.2.0
*/
@Experimental
class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
@@ -57,6 +58,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
* Method to train a gradient boosting model
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* @return a gradient boosted trees model that can be used for prediction
+ * @since 1.2.0
*/
def run(input: RDD[LabeledPoint]): GradientBoostedTreesModel = {
val algo = boostingStrategy.treeStrategy.algo
@@ -74,6 +76,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
/**
* Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#run]].
+ * @since 1.2.0
*/
def run(input: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = {
run(input.rdd)
@@ -88,6 +91,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
* E.g., these two datasets could be created from an original dataset
* by using [[org.apache.spark.rdd.RDD.randomSplit()]]
* @return a gradient boosted trees model that can be used for prediction
+ * @since 1.4.0
*/
def runWithValidation(
input: RDD[LabeledPoint],
@@ -111,6 +115,7 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
/**
* Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation]].
+ * @since 1.4.0
*/
def runWithValidation(
input: JavaRDD[LabeledPoint],
@@ -119,6 +124,9 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
}
}
+/**
+ * @since 1.2.0
+ */
object GradientBoostedTrees extends Logging {
/**
@@ -129,6 +137,7 @@ object GradientBoostedTrees extends Logging {
* For regression, labels are real numbers.
* @param boostingStrategy Configuration options for the boosting algorithm.
* @return a gradient boosted trees model that can be used for prediction
+ * @since 1.2.0
*/
def train(
input: RDD[LabeledPoint],
@@ -138,6 +147,7 @@ object GradientBoostedTrees extends Logging {
/**
* Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees$#train]]
+ * @since 1.2.0
*/
def train(
input: JavaRDD[LabeledPoint],
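
[Editor's note: a sketch of how the run/runWithValidation entry points tagged above are typically driven; `data` is an assumed RDD[LabeledPoint], and numIterations is a BoostingStrategy field outside this diff.]

    import org.apache.spark.mllib.tree.GradientBoostedTrees
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy

    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.numIterations = 50

    // runWithValidation (1.4.0) stops early once validation error stalls.
    val Array(train, validation) = data.randomSplit(Array(0.8, 0.2))
    val model = new GradientBoostedTrees(boostingStrategy)
      .runWithValidation(train, validation)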
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 069959976a..9f3230656a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -260,6 +260,9 @@ private class RandomForest (
}
+/**
+ * @since 1.2.0
+ */
object RandomForest extends Serializable with Logging {
/**
@@ -276,6 +279,7 @@ object RandomForest extends Serializable with Logging {
* if numTrees > 1 (forest) set to "sqrt".
* @param seed Random seed for bootstrapping and choosing feature subsets.
* @return a random forest model that can be used for prediction
+ * @since 1.2.0
*/
def trainClassifier(
input: RDD[LabeledPoint],
@@ -313,6 +317,7 @@ object RandomForest extends Serializable with Logging {
* (suggested value: 100)
* @param seed Random seed for bootstrapping and choosing feature subsets.
* @return a random forest model that can be used for prediction
+ * @since 1.2.0
*/
def trainClassifier(
input: RDD[LabeledPoint],
@@ -332,6 +337,7 @@ object RandomForest extends Serializable with Logging {
/**
* Java-friendly API for [[org.apache.spark.mllib.tree.RandomForest$#trainClassifier]]
+ * @since 1.2.0
*/
def trainClassifier(
input: JavaRDD[LabeledPoint],
@@ -362,6 +368,7 @@ object RandomForest extends Serializable with Logging {
* if numTrees > 1 (forest) set to "onethird".
* @param seed Random seed for bootstrapping and choosing feature subsets.
* @return a random forest model that can be used for prediction
+ * @since 1.2.0
*/
def trainRegressor(
input: RDD[LabeledPoint],
@@ -398,6 +405,7 @@ object RandomForest extends Serializable with Logging {
* (suggested value: 100)
* @param seed Random seed for bootstrapping and choosing feature subsets.
* @return a random forest model that can be used for prediction
+ * @since 1.2.0
*/
def trainRegressor(
input: RDD[LabeledPoint],
@@ -416,6 +424,7 @@ object RandomForest extends Serializable with Logging {
/**
* Java-friendly API for [[org.apache.spark.mllib.tree.RandomForest$#trainRegressor]]
+ * @since 1.2.0
*/
def trainRegressor(
input: JavaRDD[LabeledPoint],
@@ -433,6 +442,7 @@ object RandomForest extends Serializable with Logging {
/**
* List of supported feature subset sampling strategies.
+ * @since 1.2.0
*/
val supportedFeatureSubsetStrategies: Array[String] =
Array("auto", "all", "sqrt", "log2", "onethird")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
index b609925997..d9a49aa71f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
@@ -22,6 +22,7 @@ import org.apache.spark.annotation.Experimental
/**
* :: Experimental ::
* Enum to select the algorithm for the decision tree
+ * @since 1.0.0
*/
@Experimental
object Algo extends Enumeration {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 50fe2ac53d..88e5f57e9a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
@@ -38,6 +38,7 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss}
* validation input between two iterations is less than the validationTol
* then stop. Ignored when
* [[org.apache.spark.mllib.tree.GradientBoostedTrees.run()]] is used.
+ * @since 1.2.0
*/
@Experimental
case class BoostingStrategy(
@@ -70,6 +71,9 @@ case class BoostingStrategy(
}
}
+/**
+ * @since 1.2.0
+ */
@Experimental
object BoostingStrategy {
@@ -77,6 +81,7 @@ object BoostingStrategy {
* Returns default configuration for the boosting algorithm
* @param algo Learning goal. Supported: "Classification" or "Regression"
* @return Configuration for boosting algorithm
+ * @since 1.2.0
*/
def defaultParams(algo: String): BoostingStrategy = {
defaultParams(Algo.fromString(algo))
@@ -88,6 +93,7 @@ object BoostingStrategy {
* [[org.apache.spark.mllib.tree.configuration.Algo.Classification]],
* [[org.apache.spark.mllib.tree.configuration.Algo.Regression]]
* @return Configuration for boosting algorithm
+ * @since 1.3.0
*/
def defaultParams(algo: Algo): BoostingStrategy = {
val treeStrategy = Strategy.defaultStrategy(algo)
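
[Editor's note: the two defaultParams overloads tagged above, side by side; the nested treeStrategy field is referenced elsewhere in this diff.]

    import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy}

    val byName = BoostingStrategy.defaultParams("Classification")    // String overload, 1.2.0
    val byEnum = BoostingStrategy.defaultParams(Algo.Classification) // Algo overload, 1.3.0
    byEnum.treeStrategy.maxDepth = 3   // nested tree strategy is mutable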
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala
index f4c8772327..0684cafa48 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/FeatureType.scala
@@ -22,6 +22,7 @@ import org.apache.spark.annotation.Experimental
/**
* :: Experimental ::
* Enum to describe whether a feature is "continuous" or "categorical"
+ * @since 1.0.0
*/
@Experimental
object FeatureType extends Enumeration {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala
index 7da976e55a..2daa63c4d2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/QuantileStrategy.scala
@@ -22,6 +22,7 @@ import org.apache.spark.annotation.Experimental
/**
* :: Experimental ::
* Enum for selecting the quantile calculation strategy
+ * @since 1.0.0
*/
@Experimental
object QuantileStrategy extends Enumeration {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
index de2c784809..7ae25a88bf 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
@@ -66,6 +66,7 @@ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
* E.g. 10 means that the cache will get checkpointed every 10 updates. If
* the checkpoint directory is not set in
* [[org.apache.spark.SparkContext]], this setting is ignored.
+ * @since 1.0.0
*/
@Experimental
class Strategy (
@@ -83,16 +84,23 @@ class Strategy (
@BeanProperty var useNodeIdCache: Boolean = false,
@BeanProperty var checkpointInterval: Int = 10) extends Serializable {
+ /**
+ * @since 1.2.0
+ */
def isMulticlassClassification: Boolean = {
algo == Classification && numClasses > 2
}
+ /**
+ * @since 1.2.0
+ */
def isMulticlassWithCategoricalFeatures: Boolean = {
isMulticlassClassification && (categoricalFeaturesInfo.size > 0)
}
/**
* Java-friendly constructor for [[org.apache.spark.mllib.tree.configuration.Strategy]]
+ * @since 1.1.0
*/
def this(
algo: Algo,
@@ -107,6 +115,7 @@ class Strategy (
/**
* Sets Algorithm using a String.
+ * @since 1.2.0
*/
def setAlgo(algo: String): Unit = algo match {
case "Classification" => setAlgo(Classification)
@@ -115,6 +124,7 @@ class Strategy (
/**
* Sets categoricalFeaturesInfo using a Java Map.
+ * @since 1.2.0
*/
def setCategoricalFeaturesInfo(
categoricalFeaturesInfo: java.util.Map[java.lang.Integer, java.lang.Integer]): Unit = {
@@ -162,7 +172,10 @@ class Strategy (
s"$subsamplingRate")
}
- /** Returns a shallow copy of this instance. */
+ /**
+ * Returns a shallow copy of this instance.
+ * @since 1.2.0
+ */
def copy: Strategy = {
new Strategy(algo, impurity, maxDepth, numClasses, maxBins,
quantileCalculationStrategy, categoricalFeaturesInfo, minInstancesPerNode, minInfoGain,
@@ -170,12 +183,16 @@ class Strategy (
}
}
+/**
+ * @since 1.2.0
+ */
@Experimental
object Strategy {
/**
* Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]]
* @param algo "Classification" or "Regression"
+ * @since 1.2.0
*/
def defaultStrategy(algo: String): Strategy = {
defaultStrategy(Algo.fromString(algo))
@@ -184,6 +201,7 @@ object Strategy {
/**
* Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]]
* @param algo Algo.Classification or Algo.Regression
+ * @since 1.3.0
*/
def defaultStrategy(algo: Algo): Strategy = algo match {
case Algo.Classification =>
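
[Editor's note: a configuration sketch for the constructor and helpers tagged above; treating feature 0 as categorical with 3 values is purely illustrative.]

    import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}
    import org.apache.spark.mllib.tree.impurity.Gini

    val strategy = new Strategy(algo = Algo.Classification, impurity = Gini,
      maxDepth = 5, numClasses = 3, categoricalFeaturesInfo = Map(0 -> 3))
    strategy.isMulticlassClassification            // true: numClasses > 2
    strategy.isMulticlassWithCategoricalFeatures   // true: categorical map is non-empty
    val tweaked = strategy.copy                    // shallow copy; mutate without touching the original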
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala
index 0768204c33..0b6c7266de 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala
@@ -23,6 +23,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
* :: Experimental ::
* Class for calculating [[http://en.wikipedia.org/wiki/Binary_entropy_function entropy]] during
* binary classification.
+ * @since 1.0.0
*/
@Experimental
object Entropy extends Impurity {
@@ -35,6 +36,7 @@ object Entropy extends Impurity {
* @param counts Array[Double] with counts for each label
* @param totalCount sum of counts for all labels
* @return information value, or 0 if totalCount = 0
+ * @since 1.1.0
*/
@DeveloperApi
override def calculate(counts: Array[Double], totalCount: Double): Double = {
@@ -62,6 +64,7 @@ object Entropy extends Impurity {
* @param sum sum of labels
* @param sumSquares summation of squares of the labels
* @return information value, or 0 if count = 0
+ * @since 1.0.0
*/
@DeveloperApi
override def calculate(count: Double, sum: Double, sumSquares: Double): Double =
@@ -70,6 +73,7 @@ object Entropy extends Impurity {
/**
* Get this impurity instance.
* This is useful for passing impurity parameters to a Strategy in Java.
+ * @since 1.1.0
*/
def instance: this.type = this
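
[Editor's note: a sanity check on the counts-based calculate overload tagged above. With label counts (4, 6) out of 10, and assuming the implementation's log base 2, entropy is -(0.4 * log2 0.4 + 0.6 * log2 0.6) ≈ 0.971.]

    import org.apache.spark.mllib.tree.impurity.Entropy

    Entropy.calculate(Array(4.0, 6.0), 10.0)  // ≈ 0.971; impurity peaks at 1.0 for a 50/50 split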
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala
index d0077db683..3b0be42883 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Gini.scala
@@ -24,6 +24,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
* Class for calculating the
* [[http://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity Gini impurity]]
* during binary classification.
+ * @since 1.0.0
*/
@Experimental
object Gini extends Impurity {
@@ -34,6 +35,7 @@ object Gini extends Impurity {
* @param counts Array[Double] with counts for each label
* @param totalCount sum of counts for all labels
* @return information value, or 0 if totalCount = 0
+ * @since 1.1.0
*/
@DeveloperApi
override def calculate(counts: Array[Double], totalCount: Double): Double = {
@@ -58,6 +60,7 @@ object Gini extends Impurity {
* @param sum sum of labels
* @param sumSquares summation of squares of the labels
* @return information value, or 0 if count = 0
+ * @since 1.0.0
*/
@DeveloperApi
override def calculate(count: Double, sum: Double, sumSquares: Double): Double =
@@ -66,6 +69,7 @@ object Gini extends Impurity {
/**
* Get this impurity instance.
* This is useful for passing impurity parameters to a Strategy in Java.
+ * @since 1.1.0
*/
def instance: this.type = this
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
index 86cee7e430..dd29740005 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Impurity.scala
@@ -25,6 +25,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
* This trait is used for
* (a) setting the impurity parameter in [[org.apache.spark.mllib.tree.configuration.Strategy]]
* (b) calculating impurity values from sufficient statistics.
+ * @since 1.0.0
*/
@Experimental
trait Impurity extends Serializable {
@@ -35,6 +36,7 @@ trait Impurity extends Serializable {
* @param counts Array[Double] with counts for each label
* @param totalCount sum of counts for all labels
* @return information value, or 0 if totalCount = 0
+ * @since 1.1.0
*/
@DeveloperApi
def calculate(counts: Array[Double], totalCount: Double): Double
@@ -46,6 +48,7 @@ trait Impurity extends Serializable {
* @param sum sum of labels
* @param sumSquares summation of squares of the labels
* @return information value, or 0 if count = 0
+ * @since 1.0.0
*/
@DeveloperApi
def calculate(count: Double, sum: Double, sumSquares: Double): Double
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala
index 04d0cd24e6..adbe05811f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Variance.scala
@@ -22,6 +22,7 @@ import org.apache.spark.annotation.{DeveloperApi, Experimental}
/**
* :: Experimental ::
* Class for calculating variance during regression
+ * @since 1.0.0
*/
@Experimental
object Variance extends Impurity {
@@ -32,6 +33,7 @@ object Variance extends Impurity {
* @param counts Array[Double] with counts for each label
* @param totalCount sum of counts for all labels
* @return information value, or 0 if totalCount = 0
+ * @since 1.1.0
*/
@DeveloperApi
override def calculate(counts: Array[Double], totalCount: Double): Double =
@@ -44,6 +46,7 @@ object Variance extends Impurity {
* @param sum sum of labels
* @param sumSquares summation of squares of the labels
* @return information value, or 0 if count = 0
+ * @since 1.0.0
*/
@DeveloperApi
override def calculate(count: Double, sum: Double, sumSquares: Double): Double = {
@@ -57,6 +60,7 @@ object Variance extends Impurity {
/**
* Get this impurity instance.
* This is useful for passing impurity parameters to a Strategy in Java.
+ * @since 1.0.0
*/
def instance: this.type = this
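
[Editor's note: the regression overload tagged above works from sufficient statistics. For labels {1, 2, 3}: count = 3, sum = 6, sumSquares = 14, and the population variance is (14 - 6*6/3) / 3 = 2/3 — assuming that formula matches the elided method body.]

    import org.apache.spark.mllib.tree.impurity.Variance

    Variance.calculate(3.0, 6.0, 14.0)  // ≈ 0.667, the population variance of {1, 2, 3}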
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala
index 2bdef73c4a..c6e3d0d824 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/AbsoluteError.scala
@@ -29,6 +29,7 @@ import org.apache.spark.mllib.tree.model.TreeEnsembleModel
* The absolute (L1) error is defined as:
* |y - F(x)|
* where y is the label and F(x) is the model prediction for features x.
+ * @since 1.2.0
*/
@DeveloperApi
object AbsoluteError extends Loss {
@@ -40,6 +41,7 @@ object AbsoluteError extends Loss {
* @param prediction Predicted label.
* @param label True label.
* @return Loss gradient
+ * @since 1.2.0
*/
override def gradient(prediction: Double, label: Double): Double = {
if (label - prediction < 0) 1.0 else -1.0
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala
index 778c24526d..eee58445a1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/LogLoss.scala
@@ -31,6 +31,7 @@ import org.apache.spark.mllib.util.MLUtils
* The log loss is defined as:
* 2 log(1 + exp(-2 y F(x)))
* where y is a label in {-1, 1} and F(x) is the model prediction for features x.
+ * @since 1.2.0
*/
@DeveloperApi
object LogLoss extends Loss {
@@ -42,6 +43,7 @@ object LogLoss extends Loss {
* @param prediction Predicted label.
* @param label True label.
* @return Loss gradient
+ * @since 1.2.0
*/
override def gradient(prediction: Double, label: Double): Double = {
- 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala
index 64ffccbce0..7c9fb92464 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala
@@ -26,6 +26,7 @@ import org.apache.spark.rdd.RDD
/**
* :: DeveloperApi ::
* Trait for adding "pluggable" loss functions for the gradient boosting algorithm.
+ * @since 1.2.0
*/
@DeveloperApi
trait Loss extends Serializable {
@@ -35,6 +36,7 @@ trait Loss extends Serializable {
* @param prediction Predicted feature
* @param label true label.
* @return Loss gradient.
+ * @since 1.2.0
*/
def gradient(prediction: Double, label: Double): Double
@@ -45,6 +47,7 @@ trait Loss extends Serializable {
* @param model Model of the weak learner.
* @param data Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* @return Measure of model error on data
+ * @since 1.2.0
*/
def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double = {
data.map(point => computeError(model.predict(point.features), point.label)).mean()
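
[Editor's note: since computeError is implemented in the trait itself (the mean of per-point errors, as shown above), any concrete Loss can score a trained ensemble directly. A sketch, assuming a trained GradientBoostedTreesModel named gbtModel and an RDD[LabeledPoint] named data.]

    import org.apache.spark.mllib.tree.loss.SquaredError

    val meanError = SquaredError.computeError(gbtModel, data)  // mean L2 error over data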
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala
index 42c9ead988..47dc94cde7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Losses.scala
@@ -17,8 +17,14 @@
package org.apache.spark.mllib.tree.loss
+/**
+ * @since 1.2.0
+ */
object Losses {
+ /**
+ * @since 1.2.0
+ */
def fromString(name: String): Loss = name match {
case "leastSquaresError" => SquaredError
case "leastAbsoluteError" => AbsoluteError
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
index 011a5d5742..ff8903d695 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/SquaredError.scala
@@ -29,6 +29,7 @@ import org.apache.spark.mllib.tree.model.TreeEnsembleModel
* The squared (L2) error is defined as:
* (y - F(x))**2
* where y is the label and F(x) is the model prediction for features x.
+ * @since 1.2.0
*/
@DeveloperApi
object SquaredError extends Loss {
@@ -40,6 +41,7 @@ object SquaredError extends Loss {
* @param prediction Predicted label.
* @param label True label.
* @return Loss gradient
+ * @since 1.2.0
*/
override def gradient(prediction: Double, label: Double): Double = {
- 2.0 * (label - prediction)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
index f2c78bbabf..0f386a2660 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala
@@ -40,6 +40,7 @@ import org.apache.spark.util.Utils
* This model stores the decision tree structure and parameters.
* @param topNode root node
* @param algo algorithm type -- classification or regression
+ * @since 1.0.0
*/
@Experimental
class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable with Saveable {
@@ -49,6 +50,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
*
* @param features array representing a single data point
* @return Double prediction from the trained model
+ * @since 1.0.0
*/
def predict(features: Vector): Double = {
topNode.predict(features)
@@ -59,6 +61,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
*
* @param features RDD representing data points to be predicted
* @return RDD of predictions for each of the given data points
+ * @since 1.0.0
*/
def predict(features: RDD[Vector]): RDD[Double] = {
features.map(x => predict(x))
@@ -69,6 +72,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
*
* @param features JavaRDD representing data points to be predicted
* @return JavaRDD of predictions for each of the given data points
+ * @since 1.2.0
*/
def predict(features: JavaRDD[Vector]): JavaRDD[Double] = {
predict(features.rdd)
@@ -76,6 +80,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
/**
* Get number of nodes in tree, including leaf nodes.
+ * @since 1.1.0
*/
def numNodes: Int = {
1 + topNode.numDescendants
@@ -84,6 +89,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
/**
* Get depth of tree.
* E.g.: Depth 0 means 1 leaf node. Depth 1 means 1 internal node and 2 leaf nodes.
+ * @since 1.1.0
*/
def depth: Int = {
topNode.subtreeDepth
@@ -109,6 +115,12 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
header + topNode.subtreeToString(2)
}
+ /**
+ * @param sc Spark context used to save model data.
+ * @param path Path specifying the directory in which to save this model.
+ * If the directory already exists, this method throws an exception.
+ * @since 1.3.0
+ */
override def save(sc: SparkContext, path: String): Unit = {
DecisionTreeModel.SaveLoadV1_0.save(sc, path, this)
}
@@ -116,6 +128,9 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
override protected def formatVersion: String = DecisionTreeModel.formatVersion
}
+/**
+ * @since 1.3.0
+ */
object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging {
private[spark] def formatVersion: String = "1.0"
@@ -297,6 +312,13 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging {
}
}
+ /**
+ *
+ * @param sc Spark context used for loading model files.
+ * @param path Path specifying the directory to which the model was saved.
+ * @return Model instance
+ * @since 1.3.0
+ */
override def load(sc: SparkContext, path: String): DecisionTreeModel = {
implicit val formats = DefaultFormats
val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path)
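
[Editor's note: the save/load pair documented above round-trips a model through a directory. A minimal sketch, assuming a trained DecisionTreeModel named model; the path is illustrative, and save throws if the directory already exists.]

    import org.apache.spark.mllib.tree.model.DecisionTreeModel

    model.save(sc, "/tmp/myDecisionTreeModel")
    val sameModel = DecisionTreeModel.load(sc, "/tmp/myDecisionTreeModel")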
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
index 508bf9c1bd..23f0363639 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala
@@ -29,6 +29,7 @@ import org.apache.spark.mllib.tree.impurity.ImpurityCalculator
* @param rightImpurity right node impurity
* @param leftPredict left node predict
* @param rightPredict right node predict
+ * @since 1.0.0
*/
@DeveloperApi
class InformationGainStats(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
index a6d1398fc2..aca3350c2e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Node.scala
@@ -38,6 +38,7 @@ import org.apache.spark.mllib.linalg.Vector
* @param leftNode left child
* @param rightNode right child
* @param stats information gain stats
+ * @since 1.0.0
*/
@DeveloperApi
class Node (
@@ -58,6 +59,7 @@ class Node (
/**
* build the left node and right nodes if not leaf
* @param nodes array of nodes
+ * @since 1.0.0
*/
@deprecated("build should no longer be used since trees are constructed on-the-fly in training",
"1.2.0")
@@ -79,6 +81,7 @@ class Node (
* predict value if node is not leaf
* @param features feature value
* @return predicted value
+ * @since 1.1.0
*/
def predict(features: Vector) : Double = {
if (isLeaf) {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala
index 5cbe7c280d..be819b59e7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Predict.scala
@@ -23,6 +23,7 @@ import org.apache.spark.annotation.DeveloperApi
* Predicted value for a node
* @param predict predicted value
* @param prob probability of the label (classification only)
+ * @since 1.2.0
*/
@DeveloperApi
class Predict(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala
index be6c9b3de5..18d40530ae 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala
@@ -30,6 +30,7 @@ import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType
* Split left if feature <= threshold, else right.
* @param featureType type of feature -- categorical or continuous
* @param categories Split left if categorical feature value is in this set, else right.
+ * @since 1.0.0
*/
@DeveloperApi
case class Split(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
index 905c5fb42b..0c629b12a8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
@@ -45,6 +45,7 @@ import org.apache.spark.util.Utils
*
* @param algo algorithm for the ensemble model, either Classification or Regression
* @param trees tree ensembles
+ * @since 1.2.0
*/
@Experimental
class RandomForestModel(override val algo: Algo, override val trees: Array[DecisionTreeModel])
@@ -54,6 +55,13 @@ class RandomForestModel(override val algo: Algo, override val trees: Array[Decis
require(trees.forall(_.algo == algo))
+ /**
+ *
+ * @param sc Spark context used to save model data.
+ * @param path Path specifying the directory in which to save this model.
+ * If the directory already exists, this method throws an exception.
+ * @since 1.3.0
+ */
override def save(sc: SparkContext, path: String): Unit = {
TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this,
RandomForestModel.SaveLoadV1_0.thisClassName)
@@ -62,10 +70,20 @@ class RandomForestModel(override val algo: Algo, override val trees: Array[Decis
override protected def formatVersion: String = RandomForestModel.formatVersion
}
+/**
+ * @since 1.3.0
+ */
object RandomForestModel extends Loader[RandomForestModel] {
private[mllib] def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion
+ /**
+ *
+ * @param sc Spark context used for loading model files.
+ * @param path Path specifying the directory to which the model was saved.
+ * @return Model instance
+ * @since 1.3.0
+ */
override def load(sc: SparkContext, path: String): RandomForestModel = {
val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path)
val classNameV1_0 = SaveLoadV1_0.thisClassName
@@ -96,6 +114,7 @@ object RandomForestModel extends Loader[RandomForestModel] {
* @param algo algorithm for the ensemble model, either Classification or Regression
* @param trees tree ensembles
* @param treeWeights tree ensemble weights
+ * @since 1.2.0
*/
@Experimental
class GradientBoostedTreesModel(
@@ -107,6 +126,12 @@ class GradientBoostedTreesModel(
require(trees.length == treeWeights.length)
+ /**
+ * @param sc Spark context used to save model data.
+ * @param path Path specifying the directory in which to save this model.
+ * If the directory already exists, this method throws an exception.
+ * @since 1.3.0
+ */
override def save(sc: SparkContext, path: String): Unit = {
TreeEnsembleModel.SaveLoadV1_0.save(sc, path, this,
GradientBoostedTreesModel.SaveLoadV1_0.thisClassName)
@@ -118,6 +143,7 @@ class GradientBoostedTreesModel(
* @param loss evaluation metric.
* @return an array with index i having the losses or errors for the ensemble
* containing the first i+1 trees
+ * @since 1.4.0
*/
def evaluateEachIteration(
data: RDD[LabeledPoint],
@@ -159,6 +185,9 @@ class GradientBoostedTreesModel(
override protected def formatVersion: String = GradientBoostedTreesModel.formatVersion
}
+/**
+ * @since 1.3.0
+ */
object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {
/**
@@ -170,6 +199,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {
* @param loss: evaluation metric.
* @return a RDD with each element being a zip of the prediction and error
* corresponding to every sample.
+ * @since 1.4.0
*/
def computeInitialPredictionAndError(
data: RDD[LabeledPoint],
@@ -193,6 +223,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {
* @param loss: evaluation metric.
* @return a RDD with each element being a zip of the prediction and error
* corresponding to each sample.
+ * @since 1.4.0
*/
def updatePredictionError(
data: RDD[LabeledPoint],
@@ -213,6 +244,12 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {
private[mllib] def formatVersion: String = TreeEnsembleModel.SaveLoadV1_0.thisFormatVersion
+ /**
+ * @param sc Spark context used for loading model files.
+ * @param path Path specifying the directory to which the model was saved.
+ * @return Model instance
+ * @since 1.3.0
+ */
override def load(sc: SparkContext, path: String): GradientBoostedTreesModel = {
val (loadedClassName, version, jsonMetadata) = Loader.loadMetadata(sc, path)
val classNameV1_0 = SaveLoadV1_0.thisClassName
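
[Editor's note: evaluateEachIteration (tagged 1.4.0 above) returns the error of each growing prefix of the ensemble, which makes choosing an ensemble size straightforward. A sketch with an assumed gbtModel and a held-out RDD[LabeledPoint] named validationData.]

    import org.apache.spark.mllib.tree.loss.SquaredError

    val errors = gbtModel.evaluateEachIteration(validationData, SquaredError)
    val bestNumTrees = errors.indexOf(errors.min) + 1  // index i covers the first i+1 trees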
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala
index bcaacc1b1f..f520b3a1b7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/package.scala
@@ -24,6 +24,7 @@ package org.apache.spark.mllib
* - information loss calculation with entropy and Gini for classification and
* variance for regression,
* - both continuous and categorical features.
+ * @since 1.0.0
*/
package object tree {
}