author     hyukjinkwon <gurwls223@gmail.com>    2016-11-19 11:24:15 +0000
committer  Sean Owen <sowen@cloudera.com>       2016-11-19 11:24:15 +0000
commit     d5b1d5fc80153571c308130833d0c0774de62c92 (patch)
tree       a194c154699e2edb55c146232c66251d4ac77f18 /mllib/src
parent     db9fb9baacbf8640dd37a507b7450db727c7e6ea (diff)
[SPARK-18445][BUILD][DOCS] Fix the markdown for `Note:`/`NOTE:`/`Note that`/`'''Note:'''` across Scala/Java API documentation
## What changes were proposed in this pull request?

It seems in Scala/Java, notes are written in several forms:

- `Note:`
- `NOTE:`
- `Note that`
- `'''Note:'''`
- `note`

This PR proposes to fix those to `note` to be consistent.

**Before**

- Scala

  ![2016-11-17 6 16 39](https://cloud.githubusercontent.com/assets/6477701/20383180/1a7aed8c-acf2-11e6-9611-5eaf6d52c2e0.png)

- Java

  ![2016-11-17 6 14 41](https://cloud.githubusercontent.com/assets/6477701/20383096/c8ffc680-acf1-11e6-914a-33460bf1401d.png)

**After**

- Scala

  ![2016-11-17 6 16 44](https://cloud.githubusercontent.com/assets/6477701/20383167/09940490-acf2-11e6-937a-0d5e1dc2cadf.png)

- Java

  ![2016-11-17 6 13 39](https://cloud.githubusercontent.com/assets/6477701/20383132/e7c2a57e-acf1-11e6-9c47-b849674d4d88.png)

## How was this patch tested?

The notes were found via

```bash
grep -r "NOTE: " . | \                     # Note:|NOTE:|Note that|'''Note:'''
grep -v "// NOTE: " | \                    # starting with // does not appear in API documentation.
grep -E '.scala|.java' | \                 # java/scala files
grep -v Suite | \                          # exclude tests
grep -v Test | \                           # exclude tests
grep -e 'org.apache.spark.api.java' \      # packages appear in API documentation
  -e 'org.apache.spark.api.java.function' \  # note that this is a regular expression, so actual matches were mostly `org/apache/spark/api/java/functions ...`
  -e 'org.apache.spark.api.r' \
  ...
```

```bash
grep -r "Note that " . | \                 # Note:|NOTE:|Note that|'''Note:'''
grep -v "// Note that " | \                # starting with // does not appear in API documentation.
grep -E '.scala|.java' | \                 # java/scala files
grep -v Suite | \                          # exclude tests
grep -v Test | \                           # exclude tests
grep -e 'org.apache.spark.api.java' \      # packages appear in API documentation
  -e 'org.apache.spark.api.java.function' \
  -e 'org.apache.spark.api.r' \
  ...
```

```bash
grep -r "Note: " . | \                     # Note:|NOTE:|Note that|'''Note:'''
grep -v "// Note: " | \                    # starting with // does not appear in API documentation.
grep -E '.scala|.java' | \                 # java/scala files
grep -v Suite | \                          # exclude tests
grep -v Test | \                           # exclude tests
grep -e 'org.apache.spark.api.java' \      # packages appear in API documentation
  -e 'org.apache.spark.api.java.function' \
  -e 'org.apache.spark.api.r' \
  ...
```

```bash
grep -r "'''Note:'''" . | \                # Note:|NOTE:|Note that|'''Note:'''
grep -v "// '''Note:''' " | \              # starting with // does not appear in API documentation.
grep -E '.scala|.java' | \                 # java/scala files
grep -v Suite | \                          # exclude tests
grep -v Test | \                           # exclude tests
grep -e 'org.apache.spark.api.java' \      # packages appear in API documentation
  -e 'org.apache.spark.api.java.function' \
  -e 'org.apache.spark.api.r' \
  ...
```

The matches were then fixed one by one, checking against the generated API documentation and access modifiers. After that, the result was manually verified via `jekyll build`.

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #15889 from HyukjinKwon/SPARK-18437.
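For readers skimming the diff below, the change is mechanical: a trailing `Note:`/`NOTE:`/`Note that`/`'''Note:'''` sentence in a Scaladoc comment becomes a `@note` tag, typically placed after the `@param`/`@return` tags. A minimal before/after sketch of the pattern, using a hypothetical `Example` object and `train` methods rather than any class touched by this patch:

```scala
// Illustrative only: not part of this patch.
object Example {
  /**
   * Trains a model given an RDD of (label, features) pairs. We run a fixed
   * number of iterations of gradient descent.
   * Note: Labels should be {0, 1}.
   *
   * @param numIterations Number of iterations of gradient descent to run.
   */
  def trainBefore(numIterations: Int): Unit = ()  // old style: "Note:" rendered as plain prose

  /**
   * Trains a model given an RDD of (label, features) pairs. We run a fixed
   * number of iterations of gradient descent.
   *
   * @param numIterations Number of iterations of gradient descent to run.
   *
   * @note Labels should be {0, 1}.
   */
  def trainAfter(numIterations: Int): Unit = ()   // new style: "@note" rendered as a dedicated note block
}
```

Scaladoc (and the Java docs generated via genjavadoc/Unidoc) render `@note` as a distinct note annotation, which is what the before/after screenshots above illustrate.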
Diffstat (limited to 'mllib/src')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/Model.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala | 36
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala | 5
-rwxr-xr-x  mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/param/params.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala | 28
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala | 28
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala | 20
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala | 8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala | 8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala | 21
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala | 34
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala | 32
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala | 12
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala | 4
36 files changed, 185 insertions, 146 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Model.scala b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
index 252acc1565..c581fed177 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Model.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Model.scala
@@ -30,7 +30,7 @@ import org.apache.spark.ml.param.ParamMap
abstract class Model[M <: Model[M]] extends Transformer {
/**
* The parent estimator that produced this model.
- * Note: For ensembles' component Models, this value can be null.
+ * @note For ensembles' component Models, this value can be null.
*/
@transient var parent: Estimator[M] = _
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index bb192ab5f2..7424031ed4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -207,9 +207,9 @@ class DecisionTreeClassificationModel private[ml] (
* where gain is scaled by the number of instances passing through node
* - Normalize importances for tree to sum to 1.
*
- * Note: Feature importance for single decision trees can have high variance due to
- * correlated predictor variables. Consider using a [[RandomForestClassifier]]
- * to determine feature importance instead.
+ * @note Feature importance for single decision trees can have high variance due to
+ * correlated predictor variables. Consider using a [[RandomForestClassifier]]
+ * to determine feature importance instead.
*/
@Since("2.0.0")
lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(this, numFeatures)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index f8f164e8c1..52f93f5a6b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -43,7 +43,6 @@ import org.apache.spark.sql.types.DoubleType
* Gradient-Boosted Trees (GBTs) (http://en.wikipedia.org/wiki/Gradient_boosting)
* learning algorithm for classification.
* It supports binary labels, as well as both continuous and categorical features.
- * Note: Multiclass labels are not currently supported.
*
* The implementation is based upon: J.H. Friedman. "Stochastic Gradient Boosting." 1999.
*
@@ -54,6 +53,8 @@ import org.apache.spark.sql.types.DoubleType
* based on the loss function, whereas the original gradient boosting method does not.
* - We expect to implement TreeBoost in the future:
* [https://issues.apache.org/jira/browse/SPARK-4240]
+ *
+ * @note Multiclass labels are not currently supported.
*/
@Since("1.4.0")
class GBTClassifier @Since("1.4.0") (
@@ -169,10 +170,11 @@ object GBTClassifier extends DefaultParamsReadable[GBTClassifier] {
* Gradient-Boosted Trees (GBTs) (http://en.wikipedia.org/wiki/Gradient_boosting)
* model for classification.
* It supports binary labels, as well as both continuous and categorical features.
- * Note: Multiclass labels are not currently supported.
*
* @param _trees Decision trees in the ensemble.
* @param _treeWeights Weights for the decision trees in the ensemble.
+ *
+ * @note Multiclass labels are not currently supported.
*/
@Since("1.6.0")
class GBTClassificationModel private[ml](
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 18b9b3043d..71a7fe53c1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -1191,8 +1191,8 @@ class BinaryLogisticRegressionSummary private[classification] (
* with (0.0, 0.0) prepended and (1.0, 1.0) appended to it.
* See http://en.wikipedia.org/wiki/Receiver_operating_characteristic
*
- * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
@transient lazy val roc: DataFrame = binaryMetrics.roc().toDF("FPR", "TPR")
@@ -1200,8 +1200,8 @@ class BinaryLogisticRegressionSummary private[classification] (
/**
* Computes the area under the receiver operating characteristic (ROC) curve.
*
- * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
lazy val areaUnderROC: Double = binaryMetrics.areaUnderROC()
@@ -1210,8 +1210,8 @@ class BinaryLogisticRegressionSummary private[classification] (
* Returns the precision-recall curve, which is a Dataframe containing
* two fields recall, precision with (0.0, 1.0) prepended to it.
*
- * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
@transient lazy val pr: DataFrame = binaryMetrics.pr().toDF("recall", "precision")
@@ -1219,8 +1219,8 @@ class BinaryLogisticRegressionSummary private[classification] (
/**
* Returns a dataframe with two fields (threshold, F-Measure) curve with beta = 1.0.
*
- * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
@transient lazy val fMeasureByThreshold: DataFrame = {
@@ -1232,8 +1232,8 @@ class BinaryLogisticRegressionSummary private[classification] (
* Every possible probability obtained in transforming the dataset are used
* as thresholds used in calculating the precision.
*
- * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
@transient lazy val precisionByThreshold: DataFrame = {
@@ -1245,8 +1245,8 @@ class BinaryLogisticRegressionSummary private[classification] (
* Every possible probability obtained in transforming the dataset are used
* as thresholds used in calculating the recall.
*
- * Note: This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from `LogisticRegression.weightCol`.
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
@transient lazy val recallByThreshold: DataFrame = {
@@ -1401,18 +1401,18 @@ class BinaryLogisticRegressionSummary private[classification] (
* $$
* </blockquote></p>
*
- * @note In order to avoid unnecessary computation during calculation of the gradient updates
- * we lay out the coefficients in column major order during training. This allows us to
- * perform feature standardization once, while still retaining sequential memory access
- * for speed. We convert back to row major order when we create the model,
- * since this form is optimal for the matrix operations used for prediction.
- *
* @param bcCoefficients The broadcast coefficients corresponding to the features.
* @param bcFeaturesStd The broadcast standard deviation values of the features.
* @param numClasses the number of possible outcomes for k classes classification problem in
* Multinomial Logistic Regression.
* @param fitIntercept Whether to fit an intercept term.
* @param multinomial Whether to use multinomial (softmax) or binary loss
+ *
+ * @note In order to avoid unnecessary computation during calculation of the gradient updates
+ * we lay out the coefficients in column major order during training. This allows us to
+ * perform feature standardization once, while still retaining sequential memory access
+ * for speed. We convert back to row major order when we create the model,
+ * since this form is optimal for the matrix operations used for prediction.
*/
private class LogisticAggregator(
bcCoefficients: Broadcast[Vector],
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index a0bd66e731..c6035cc4c9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -268,9 +268,9 @@ object GaussianMixtureModel extends MLReadable[GaussianMixtureModel] {
* While this process is generally guaranteed to converge, it is not guaranteed
* to find a global optimum.
*
- * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
- * This is due to high-dimensional data (a) making it difficult to cluster at all (based
- * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.
+ * @note For high-dimensional data (with many features), this algorithm may perform poorly.
+ * This is due to high-dimensional data (a) making it difficult to cluster at all (based
+ * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.
*/
@Since("2.0.0")
@Experimental
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
index 28cbe1cb01..ccfb0ce8f8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
@@ -85,7 +85,8 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H
* </blockquote></p>
*
* For the case $E_{max} == E_{min}$, $Rescaled(e_i) = 0.5 * (max + min)$.
- * Note that since zero values will probably be transformed to non-zero values, output of the
+ *
+ * @note Since zero values will probably be transformed to non-zero values, output of the
* transformer will be DenseVector even for sparse input.
*/
@Since("1.5.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
index e8e28ba29c..ea401216ae 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala
@@ -36,7 +36,8 @@ import org.apache.spark.sql.types.{DoubleType, NumericType, StructType}
* The last category is not included by default (configurable via [[OneHotEncoder!.dropLast]]
* because it makes the vector entries sum up to one, and hence linearly dependent.
* So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
- * Note that this is different from scikit-learn's OneHotEncoder, which keeps all categories.
+ *
+ * @note This is different from scikit-learn's OneHotEncoder, which keeps all categories.
* The output vectors are sparse.
*
* @see [[StringIndexer]] for converting categorical values into category indices
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 1e49352b85..6e08bf0591 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -142,8 +142,9 @@ class PCAModel private[ml] (
/**
* Transform a vector by computed Principal Components.
- * NOTE: Vectors to be transformed must be the same length
- * as the source vectors given to [[PCA.fit()]].
+ *
+ * @note Vectors to be transformed must be the same length as the source vectors given
+ * to [[PCA.fit()]].
*/
@Since("2.0.0")
override def transform(dataset: Dataset[_]): DataFrame = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
index 666070037c..0ced21365f 100755
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -28,7 +28,10 @@ import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
/**
* A feature transformer that filters out stop words from input.
- * Note: null values from input array are preserved unless adding null to stopWords explicitly.
+ *
+ * @note null values from input array are preserved unless adding null to stopWords
+ * explicitly.
+ *
* @see [[http://en.wikipedia.org/wiki/Stop_words]]
*/
@Since("1.5.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index 80fe46796f..8b155f0001 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -113,11 +113,11 @@ object StringIndexer extends DefaultParamsReadable[StringIndexer] {
/**
* Model fitted by [[StringIndexer]].
*
- * NOTE: During transformation, if the input column does not exist,
+ * @param labels Ordered list of labels, corresponding to indices to be assigned.
+ *
+ * @note During transformation, if the input column does not exist,
* [[StringIndexerModel.transform]] would return the input dataset unmodified.
* This is a temporary fix for the case when target labels do not exist during prediction.
- *
- * @param labels Ordered list of labels, corresponding to indices to be assigned.
*/
@Since("1.4.0")
class StringIndexerModel (
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 9245931b27..96206e0b7a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -533,7 +533,7 @@ trait Params extends Identifiable with Serializable {
* Returns all params sorted by their names. The default implementation uses Java reflection to
* list all public methods that have no arguments and return [[Param]].
*
- * Note: Developer should not use this method in constructor because we cannot guarantee that
+ * @note Developer should not use this method in constructor because we cannot guarantee that
* this variable gets initialized before other params.
*/
lazy val params: Array[Param[_]] = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index ebc6c12ddc..1419da8747 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -207,9 +207,9 @@ class DecisionTreeRegressionModel private[ml] (
* where gain is scaled by the number of instances passing through node
* - Normalize importances for tree to sum to 1.
*
- * Note: Feature importance for single decision trees can have high variance due to
- * correlated predictor variables. Consider using a [[RandomForestRegressor]]
- * to determine feature importance instead.
+ * @note Feature importance for single decision trees can have high variance due to
+ * correlated predictor variables. Consider using a [[RandomForestRegressor]]
+ * to determine feature importance instead.
*/
@Since("2.0.0")
lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(this, numFeatures)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 1d2961e027..736fd3b9e0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -879,8 +879,8 @@ class GeneralizedLinearRegressionSummary private[regression] (
* Private copy of model to ensure Params are not modified outside this class.
* Coefficients is not a deep copy, but that is acceptable.
*
- * NOTE: [[predictionCol]] must be set correctly before the value of [[model]] is set,
- * and [[model]] must be set before [[predictions]] is set!
+ * @note [[predictionCol]] must be set correctly before the value of [[model]] is set,
+ * and [[model]] must be set before [[predictions]] is set!
*/
protected val model: GeneralizedLinearRegressionModel =
origModel.copy(ParamMap.empty).setPredictionCol(predictionCol)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 71c542adf6..da7ce6b46f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -103,11 +103,13 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
/**
* Whether to standardize the training features before fitting the model.
* The coefficients of models will be always returned on the original scale,
- * so it will be transparent for users. Note that with/without standardization,
- * the models should be always converged to the same solution when no regularization
- * is applied. In R's GLMNET package, the default behavior is true as well.
+ * so it will be transparent for users.
* Default is true.
*
+ * @note With/without standardization, the models should be always converged
+ * to the same solution when no regularization is applied. In R's GLMNET package,
+ * the default behavior is true as well.
+ *
* @group setParam
*/
@Since("1.5.0")
@@ -624,8 +626,8 @@ class LinearRegressionSummary private[regression] (
* explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
* Reference: [[http://en.wikipedia.org/wiki/Explained_variation]]
*
- * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
val explainedVariance: Double = metrics.explainedVariance
@@ -634,8 +636,8 @@ class LinearRegressionSummary private[regression] (
* Returns the mean absolute error, which is a risk function corresponding to the
* expected value of the absolute error loss or l1-norm loss.
*
- * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
val meanAbsoluteError: Double = metrics.meanAbsoluteError
@@ -644,8 +646,8 @@ class LinearRegressionSummary private[regression] (
* Returns the mean squared error, which is a risk function corresponding to the
* expected value of the squared error loss or quadratic loss.
*
- * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
val meanSquaredError: Double = metrics.meanSquaredError
@@ -654,8 +656,8 @@ class LinearRegressionSummary private[regression] (
* Returns the root mean squared error, which is defined as the square root of
* the mean squared error.
*
- * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
val rootMeanSquaredError: Double = metrics.rootMeanSquaredError
@@ -664,8 +666,8 @@ class LinearRegressionSummary private[regression] (
* Returns R^2^, the coefficient of determination.
* Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
*
- * Note: This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
- * This will change in later Spark versions.
+ * @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
+ * This will change in later Spark versions.
*/
@Since("1.5.0")
val r2: Double = metrics.r2
diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala
index 73d813064d..e137692703 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala
@@ -48,7 +48,7 @@ import org.apache.spark.sql.{DataFrame, DataFrameReader}
* inconsistent feature dimensions.
* - "vectorType": feature vector type, "sparse" (default) or "dense".
*
- * Note that this class is public for documentation purpose. Please don't use this class directly.
+ * @note This class is public for documentation purpose. Please don't use this class directly.
* Rather, use the data source API as illustrated above.
*
* @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM datasets]]
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
index ede0a060ee..0a0bc4c006 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
@@ -98,7 +98,7 @@ private[spark] object GradientBoostedTrees extends Logging {
* @param initTreeWeight: learning rate assigned to the first tree.
* @param initTree: first DecisionTreeModel.
* @param loss: evaluation metric.
- * @return a RDD with each element being a zip of the prediction and error
+ * @return an RDD with each element being a zip of the prediction and error
* corresponding to every sample.
*/
def computeInitialPredictionAndError(
@@ -121,7 +121,7 @@ private[spark] object GradientBoostedTrees extends Logging {
* @param treeWeight: Learning rate.
* @param tree: Tree using which the prediction and error should be updated.
* @param loss: evaluation metric.
- * @return a RDD with each element being a zip of the prediction and error
+ * @return an RDD with each element being a zip of the prediction and error
* corresponding to each sample.
*/
def updatePredictionError(
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
index bc4f9e6716..e5fa5d53e3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
@@ -221,7 +221,7 @@ trait MLReadable[T] {
/**
* Reads an ML instance from the input path, a shortcut of `read.load(path)`.
*
- * Note: Implementing classes should override this to be Java-friendly.
+ * @note Implementing classes should override this to be Java-friendly.
*/
@Since("1.6.0")
def load(path: String): T = read.load(path)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index d851b98334..4b65000073 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -202,9 +202,11 @@ object LogisticRegressionModel extends Loader[LogisticRegressionModel] {
* Train a classification model for Binary Logistic Regression
* using Stochastic Gradient Descent. By default L2 regularization is used,
* which can be changed via `LogisticRegressionWithSGD.optimizer`.
- * NOTE: Labels used in Logistic Regression should be {0, 1, ..., k - 1}
- * for k classes multi-label classification problem.
+ *
* Using [[LogisticRegressionWithLBFGS]] is recommended over this.
+ *
+ * @note Labels used in Logistic Regression should be {0, 1, ..., k - 1}
+ * for k classes multi-label classification problem.
*/
@Since("0.8.0")
class LogisticRegressionWithSGD private[mllib] (
@@ -239,7 +241,8 @@ class LogisticRegressionWithSGD private[mllib] (
/**
* Top-level methods for calling Logistic Regression using Stochastic Gradient Descent.
- * NOTE: Labels used in Logistic Regression should be {0, 1}
+ *
+ * @note Labels used in Logistic Regression should be {0, 1}
*/
@Since("0.8.0")
@deprecated("Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS", "2.0.0")
@@ -252,7 +255,6 @@ object LogisticRegressionWithSGD {
* number of iterations of gradient descent using the specified step size. Each iteration uses
* `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in
* gradient descent are initialized using the initial weights provided.
- * NOTE: Labels used in Logistic Regression should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
@@ -260,6 +262,8 @@ object LogisticRegressionWithSGD {
* @param miniBatchFraction Fraction of data to be used per iteration.
* @param initialWeights Initial set of weights to be used. Array should be equal in size to
* the number of features in the data.
+ *
+ * @note Labels used in Logistic Regression should be {0, 1}
*/
@Since("1.0.0")
def train(
@@ -276,13 +280,13 @@ object LogisticRegressionWithSGD {
* Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
* number of iterations of gradient descent using the specified step size. Each iteration uses
* `miniBatchFraction` fraction of the data to calculate the gradient.
- * NOTE: Labels used in Logistic Regression should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
* @param stepSize Step size to be used for each iteration of gradient descent.
-
* @param miniBatchFraction Fraction of data to be used per iteration.
+ *
+ * @note Labels used in Logistic Regression should be {0, 1}
*/
@Since("1.0.0")
def train(
@@ -298,13 +302,13 @@ object LogisticRegressionWithSGD {
* Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
* number of iterations of gradient descent using the specified step size. We use the entire data
* set to update the gradient in each iteration.
- * NOTE: Labels used in Logistic Regression should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param stepSize Step size to be used for each iteration of Gradient Descent.
-
* @param numIterations Number of iterations of gradient descent to run.
* @return a LogisticRegressionModel which has the weights and offset from training.
+ *
+ * @note Labels used in Logistic Regression should be {0, 1}
*/
@Since("1.0.0")
def train(
@@ -318,11 +322,12 @@ object LogisticRegressionWithSGD {
* Train a logistic regression model given an RDD of (label, features) pairs. We run a fixed
* number of iterations of gradient descent using a step size of 1.0. We use the entire data set
* to update the gradient in each iteration.
- * NOTE: Labels used in Logistic Regression should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
* @return a LogisticRegressionModel which has the weights and offset from training.
+ *
+ * @note Labels used in Logistic Regression should be {0, 1}
*/
@Since("1.0.0")
def train(
@@ -335,8 +340,6 @@ object LogisticRegressionWithSGD {
/**
* Train a classification model for Multinomial/Binary Logistic Regression using
* Limited-memory BFGS. Standard feature scaling and L2 regularization are used by default.
- * NOTE: Labels used in Logistic Regression should be {0, 1, ..., k - 1}
- * for k classes multi-label classification problem.
*
* Earlier implementations of LogisticRegressionWithLBFGS applies a regularization
* penalty to all elements including the intercept. If this is called with one of
@@ -344,6 +347,9 @@ object LogisticRegressionWithSGD {
* into a call to ml.LogisticRegression, otherwise this will use the existing mllib
* GeneralizedLinearAlgorithm trainer, resulting in a regularization penalty to the
* intercept.
+ *
+ * @note Labels used in Logistic Regression should be {0, 1, ..., k - 1}
+ * for k classes multi-label classification problem.
*/
@Since("1.1.0")
class LogisticRegressionWithLBFGS
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
index 7c3ccbb40b..aec1526b55 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
@@ -125,7 +125,8 @@ object SVMModel extends Loader[SVMModel] {
/**
* Train a Support Vector Machine (SVM) using Stochastic Gradient Descent. By default L2
* regularization is used, which can be changed via [[SVMWithSGD.optimizer]].
- * NOTE: Labels used in SVM should be {0, 1}.
+ *
+ * @note Labels used in SVM should be {0, 1}.
*/
@Since("0.8.0")
class SVMWithSGD private (
@@ -158,7 +159,9 @@ class SVMWithSGD private (
}
/**
- * Top-level methods for calling SVM. NOTE: Labels used in SVM should be {0, 1}.
+ * Top-level methods for calling SVM.
+ *
+ * @note Labels used in SVM should be {0, 1}.
*/
@Since("0.8.0")
object SVMWithSGD {
@@ -169,8 +172,6 @@ object SVMWithSGD {
* `miniBatchFraction` fraction of the data to calculate the gradient. The weights used in
* gradient descent are initialized using the initial weights provided.
*
- * NOTE: Labels used in SVM should be {0, 1}.
- *
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
* @param stepSize Step size to be used for each iteration of gradient descent.
@@ -178,6 +179,8 @@ object SVMWithSGD {
* @param miniBatchFraction Fraction of data to be used per iteration.
* @param initialWeights Initial set of weights to be used. Array should be equal in size to
* the number of features in the data.
+ *
+ * @note Labels used in SVM should be {0, 1}.
*/
@Since("0.8.0")
def train(
@@ -195,7 +198,8 @@ object SVMWithSGD {
* Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
* of iterations of gradient descent using the specified step size. Each iteration uses
* `miniBatchFraction` fraction of the data to calculate the gradient.
- * NOTE: Labels used in SVM should be {0, 1}
+ *
+ * @note Labels used in SVM should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
@@ -217,13 +221,14 @@ object SVMWithSGD {
* Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
* of iterations of gradient descent using the specified step size. We use the entire data set to
* update the gradient in each iteration.
- * NOTE: Labels used in SVM should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param stepSize Step size to be used for each iteration of Gradient Descent.
* @param regParam Regularization parameter.
* @param numIterations Number of iterations of gradient descent to run.
* @return a SVMModel which has the weights and offset from training.
+ *
+ * @note Labels used in SVM should be {0, 1}
*/
@Since("0.8.0")
def train(
@@ -238,11 +243,12 @@ object SVMWithSGD {
* Train a SVM model given an RDD of (label, features) pairs. We run a fixed number
* of iterations of gradient descent using a step size of 1.0. We use the entire data set to
* update the gradient in each iteration.
- * NOTE: Labels used in SVM should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
* @return a SVMModel which has the weights and offset from training.
+ *
+ * @note Labels used in SVM should be {0, 1}
*/
@Since("0.8.0")
def train(input: RDD[LabeledPoint], numIterations: Int): SVMModel = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 43193adf3e..56cdeea5f7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -41,14 +41,14 @@ import org.apache.spark.util.Utils
* While this process is generally guaranteed to converge, it is not guaranteed
* to find a global optimum.
*
- * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
- * This is due to high-dimensional data (a) making it difficult to cluster at all (based
- * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.
- *
* @param k Number of independent Gaussians in the mixture model.
* @param convergenceTol Maximum change in log-likelihood at which convergence
* is considered to have occurred.
* @param maxIterations Maximum number of iterations allowed.
+ *
+ * @note For high-dimensional data (with many features), this algorithm may perform poorly.
+ * This is due to high-dimensional data (a) making it difficult to cluster at all (based
+ * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.
*/
@Since("1.3.0")
class GaussianMixture private (
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index ed9c064879..fa72b72e2d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -56,14 +56,18 @@ class KMeans private (
def this() = this(2, 20, KMeans.K_MEANS_PARALLEL, 2, 1e-4, Utils.random.nextLong())
/**
- * Number of clusters to create (k). Note that it is possible for fewer than k clusters to
+ * Number of clusters to create (k).
+ *
+ * @note It is possible for fewer than k clusters to
* be returned, for example, if there are fewer than k distinct points to cluster.
*/
@Since("1.4.0")
def getK: Int = k
/**
- * Set the number of clusters to create (k). Note that it is possible for fewer than k clusters to
+ * Set the number of clusters to create (k).
+ *
+ * @note It is possible for fewer than k clusters to
* be returned, for example, if there are fewer than k distinct points to cluster. Default: 2.
*/
@Since("0.8.0")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index d999b9be8e..7c52abdeaa 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -175,7 +175,7 @@ class LDA private (
*
* This is the parameter to a symmetric Dirichlet distribution.
*
- * Note: The topics' distributions over terms are called "beta" in the original LDA paper
+ * @note The topics' distributions over terms are called "beta" in the original LDA paper
* by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
*/
@Since("1.3.0")
@@ -187,7 +187,7 @@ class LDA private (
*
* This is the parameter to a symmetric Dirichlet distribution.
*
- * Note: The topics' distributions over terms are called "beta" in the original LDA paper
+ * @note The topics' distributions over terms are called "beta" in the original LDA paper
* by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
*
* If set to -1, then topicConcentration is set automatically.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 90d8a558f1..b5b0e64a2a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -66,7 +66,7 @@ abstract class LDAModel private[clustering] extends Saveable {
*
* This is the parameter to a symmetric Dirichlet distribution.
*
- * Note: The topics' distributions over terms are called "beta" in the original LDA paper
+ * @note The topics' distributions over terms are called "beta" in the original LDA paper
* by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
*/
@Since("1.5.0")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index ae324f86fe..7365ea1f20 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -93,9 +93,11 @@ final class EMLDAOptimizer extends LDAOptimizer {
/**
* If using checkpointing, this indicates whether to keep the last checkpoint (vs clean up).
* Deleting the checkpoint can cause failures if a data partition is lost, so set this bit with
- * care. Note that checkpoints will be cleaned up via reference counting, regardless.
+ * care.
*
* Default: true
+ *
+ * @note Checkpoints will be cleaned up via reference counting, regardless.
*/
@Since("2.0.0")
def setKeepLastCheckpoint(keepLastCheckpoint: Boolean): this.type = {
@@ -348,7 +350,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
* Mini-batch fraction in (0, 1], which sets the fraction of document sampled and used in
* each iteration.
*
- * Note that this should be adjusted in synch with [[LDA.setMaxIterations()]]
+ * @note This should be adjusted in synch with [[LDA.setMaxIterations()]]
* so the entire corpus is used. Specifically, set both so that
* maxIterations * miniBatchFraction >= 1.
*
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala
index f0779491e6..003d1411a9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/AreaUnderCurve.scala
@@ -39,7 +39,7 @@ private[evaluation] object AreaUnderCurve {
/**
* Returns the area under the given curve.
*
- * @param curve a RDD of ordered 2D points stored in pairs representing a curve
+ * @param curve an RDD of ordered 2D points stored in pairs representing a curve
*/
def of(curve: RDD[(Double, Double)]): Double = {
curve.sliding(2).aggregate(0.0)(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index fbd217af74..c94d7890cf 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -40,7 +40,7 @@ import org.apache.spark.sql.types._
/**
* Represents a numeric vector, whose index type is Int and value type is Double.
*
- * Note: Users should not implement this interface.
+ * @note Users should not implement this interface.
*/
@SQLUserDefinedType(udt = classOf[VectorUDT])
@Since("1.0.0")
@@ -132,7 +132,9 @@ sealed trait Vector extends Serializable {
/**
* Number of active entries. An "active entry" is an element which is explicitly stored,
- * regardless of its value. Note that inactive entries have value 0.
+ * regardless of its value.
+ *
+ * @note Inactive entries have value 0.
*/
@Since("1.4.0")
def numActives: Int
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
index 377be6bfb9..03866753b5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
@@ -451,7 +451,7 @@ class BlockMatrix @Since("1.3.0") (
* [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause
* some performance issues until support for multiplying two sparse matrices is added.
*
- * Note: The behavior of multiply has changed in 1.6.0. `multiply` used to throw an error when
+ * @note The behavior of multiply has changed in 1.6.0. `multiply` used to throw an error when
* there were blocks with duplicate indices. Now, the blocks with duplicate indices will be added
* with each other.
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
index b03b3ecde9..809906a158 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
@@ -188,8 +188,9 @@ class IndexedRowMatrix @Since("1.0.0") (
}
/**
- * Computes the Gramian matrix `A^T A`. Note that this cannot be
- * computed on matrices with more than 65535 columns.
+ * Computes the Gramian matrix `A^T A`.
+ *
+ * @note This cannot be computed on matrices with more than 65535 columns.
*/
@Since("1.0.0")
def computeGramianMatrix(): Matrix = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index ec32e37afb..4b120332ab 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -106,8 +106,9 @@ class RowMatrix @Since("1.0.0") (
}
/**
- * Computes the Gramian matrix `A^T A`. Note that this cannot be computed on matrices with
- * more than 65535 columns.
+ * Computes the Gramian matrix `A^T A`.
+ *
+ * @note This cannot be computed on matrices with more than 65535 columns.
*/
@Since("1.0.0")
def computeGramianMatrix(): Matrix = {
@@ -168,9 +169,6 @@ class RowMatrix @Since("1.0.0") (
* ARPACK is set to 300 or k * 3, whichever is larger. The numerical tolerance for ARPACK's
* eigen-decomposition is set to 1e-10.
*
- * @note The conditions that decide which method to use internally and the default parameters are
- * subject to change.
- *
* @param k number of leading singular values to keep (0 &lt; k &lt;= n).
* It might return less than k if
* there are numerically zero singular values or there are not enough Ritz values
@@ -180,6 +178,9 @@ class RowMatrix @Since("1.0.0") (
* @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
* are treated as zero, where sigma(0) is the largest singular value.
* @return SingularValueDecomposition(U, s, V). U = null if computeU = false.
+ *
+ * @note The conditions that decide which method to use internally and the default parameters are
+ * subject to change.
*/
@Since("1.0.0")
def computeSVD(
@@ -319,9 +320,11 @@ class RowMatrix @Since("1.0.0") (
}
/**
- * Computes the covariance matrix, treating each row as an observation. Note that this cannot
- * be computed on matrices with more than 65535 columns.
+ * Computes the covariance matrix, treating each row as an observation.
+ *
* @return a local dense matrix of size n x n
+ *
+ * @note This cannot be computed on matrices with more than 65535 columns.
*/
@Since("1.0.0")
def computeCovariance(): Matrix = {
@@ -369,12 +372,12 @@ class RowMatrix @Since("1.0.0") (
* The row data do not need to be "centered" first; it is not necessary for
* the mean of each column to be 0.
*
- * Note that this cannot be computed on matrices with more than 65535 columns.
- *
* @param k number of top principal components.
* @return a matrix of size n-by-k, whose columns are principal components, and
* a vector of values which indicate how much variance each principal component
* explains
+ *
+ * @note This cannot be computed on matrices with more than 65535 columns.
*/
@Since("1.6.0")
def computePrincipalComponentsAndExplainedVariance(k: Int): (Matrix, Vector) = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
index 81e64de4e5..c49e72646b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
@@ -305,7 +305,8 @@ class LeastSquaresGradient extends Gradient {
* :: DeveloperApi ::
* Compute gradient and loss for a Hinge loss function, as used in SVM binary classification.
* See also the documentation for the precise formulation.
- * NOTE: This assumes that the labels are {0,1}
+ *
+ * @note This assumes that the labels are {0,1}
*/
@DeveloperApi
class HingeGradient extends Gradient {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala
index 0f7857b8d8..005119616f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala
@@ -31,7 +31,7 @@ import org.apache.spark.rdd.RDD
class RDDFunctions[T: ClassTag](self: RDD[T]) extends Serializable {
/**
- * Returns a RDD from grouping items of its parent RDD in fixed size blocks by passing a sliding
+ * Returns an RDD from grouping items of its parent RDD in fixed size blocks by passing a sliding
* window over them. The ordering is first based on the partition index and then the ordering of
* items within each partition. This is similar to sliding in Scala collections, except that it
* becomes an empty RDD if the window size is greater than the total number of items. It needs to
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
index c642573ccb..24e4dcccc8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -43,14 +43,14 @@ import org.apache.spark.storage.StorageLevel
/**
* Model representing the result of matrix factorization.
*
- * Note: If you create the model directly using constructor, please be aware that fast prediction
- * requires cached user/product features and their associated partitioners.
- *
* @param rank Rank for the features in this model.
* @param userFeatures RDD of tuples where each tuple represents the userId and
* the features computed for this user.
* @param productFeatures RDD of tuples where each tuple represents the productId
* and the features computed for this product.
+ *
+ * @note If you create the model directly using constructor, please be aware that fast prediction
+ * requires cached user/product features and their associated partitioners.
*/
@Since("0.8.0")
class MatrixFactorizationModel @Since("0.8.0") (
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index f3159f7e72..925fdf4d7e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -60,15 +60,15 @@ object Statistics {
* Compute the correlation matrix for the input RDD of Vectors using the specified method.
* Methods currently supported: `pearson` (default), `spearman`.
*
- * Note that for Spearman, a rank correlation, we need to create an RDD[Double] for each column
- * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
- * which is fairly costly. Cache the input RDD before calling corr with `method = "spearman"` to
- * avoid recomputing the common lineage.
- *
* @param X an RDD[Vector] for which the correlation matrix is to be computed.
* @param method String specifying the method to use for computing correlation.
* Supported: `pearson` (default), `spearman`
* @return Correlation matrix comparing columns in X.
+ *
+ * @note For Spearman, a rank correlation, we need to create an RDD[Double] for each column
+ * and sort it in order to retrieve the ranks and then join the columns back into an RDD[Vector],
+ * which is fairly costly. Cache the input RDD before calling corr with `method = "spearman"` to
+ * avoid recomputing the common lineage.
*/
@Since("1.1.0")
def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method)
@@ -77,12 +77,12 @@ object Statistics {
* Compute the Pearson correlation for the input RDDs.
* Returns NaN if either vector has 0 variance.
*
- * Note: the two input RDDs need to have the same number of partitions and the same number of
- * elements in each partition.
- *
* @param x RDD[Double] of the same cardinality as y.
* @param y RDD[Double] of the same cardinality as x.
* @return A Double containing the Pearson correlation between the two input RDD[Double]s
+ *
+ * @note The two input RDDs need to have the same number of partitions and the same number of
+ * elements in each partition.
*/
@Since("1.1.0")
def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
@@ -98,15 +98,15 @@ object Statistics {
* Compute the correlation for the input RDDs using the specified method.
* Methods currently supported: `pearson` (default), `spearman`.
*
- * Note: the two input RDDs need to have the same number of partitions and the same number of
- * elements in each partition.
- *
* @param x RDD[Double] of the same cardinality as y.
* @param y RDD[Double] of the same cardinality as x.
* @param method String specifying the method to use for computing correlation.
* Supported: `pearson` (default), `spearman`
* @return A Double containing the correlation between the two input RDD[Double]s using the
* specified method.
+ *
+ * @note The two input RDDs need to have the same number of partitions and the same number of
+ * elements in each partition.
*/
@Since("1.1.0")
def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
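
The same pairwise call with an explicit method string, again assuming `sc` and aligned inputs.

```scala
import org.apache.spark.mllib.stat.Statistics

// Assumes an existing SparkContext `sc`; both RDDs are aligned partition by partition.
val x = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0), numSlices = 2)
val y = sc.parallelize(Seq(1.0, 4.0, 9.0, 16.0), numSlices = 2)

// The relationship above is monotone, so the rank-based Spearman correlation is 1.0.
val spearman = Statistics.corr(x, y, "spearman")
println(spearman)
```
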
@@ -122,15 +122,15 @@ object Statistics {
* Conduct Pearson's chi-squared goodness of fit test of the observed data against the
* expected distribution.
*
- * Note: the two input Vectors need to have the same size.
- * `observed` cannot contain negative values.
- * `expected` cannot contain nonpositive values.
- *
* @param observed Vector containing the observed categorical counts/relative frequencies.
* @param expected Vector containing the expected categorical counts/relative frequencies.
* `expected` is rescaled if the `expected` sum differs from the `observed` sum.
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
+ *
+ * @note The two input Vectors need to have the same size.
+ * `observed` cannot contain negative values.
+ * `expected` cannot contain nonpositive values.
*/
@Since("1.1.0")
def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
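
A small local example of the goodness-of-fit test; the counts are made up but respect the constraints in the `@note` (equal sizes, non-negative `observed`, positive `expected`).

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

// Same size; `observed` is non-negative and `expected` is strictly positive.
val observed = Vectors.dense(89.0, 37.0, 30.0, 28.0, 2.0)
val expected = Vectors.dense(105.0, 31.0, 15.0, 9.0, 5.0)

// `expected` is rescaled internally if its sum differs from the observed sum.
val result = Statistics.chiSqTest(observed, expected)
println(result)  // test statistic, degrees of freedom, p-value, null hypothesis
```
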
@@ -141,11 +141,11 @@ object Statistics {
* Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform
* distribution, with each category having an expected frequency of `1 / observed.size`.
*
- * Note: `observed` cannot contain negative values.
- *
* @param observed Vector containing the observed categorical counts/relative frequencies.
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
+ *
+ * @note `observed` cannot contain negative values.
*/
@Since("1.1.0")
def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
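
And the one-argument form, which tests against the uniform distribution; the counts are again made up.

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

// Each category gets an expected frequency of 1 / observed.size;
// `observed` must not contain negative values.
val observed = Vectors.dense(4.0, 6.0, 5.0, 5.0)
val result = Statistics.chiSqTest(observed)
println(result.pValue)
```
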
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index 36feab7859..d846c43cf2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -75,10 +75,6 @@ object DecisionTree extends Serializable with Logging {
* Method to train a decision tree model.
* The method supports binary and multiclass classification and regression.
*
- * Note: Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
- * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
- * is recommended to clearly separate classification and regression.
- *
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* For classification, labels should take values {0, 1, ..., numClasses-1}.
* For regression, labels are real numbers.
@@ -86,6 +82,10 @@ object DecisionTree extends Serializable with Logging {
* of decision tree (classification or regression), feature type (continuous,
* categorical), depth of the tree, quantile calculation strategy, etc.
* @return DecisionTreeModel that can be used for prediction.
+ *
+ * @note Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
+ * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
+ * is recommended to clearly separate classification and regression.
*/
@Since("1.0.0")
def train(input: RDD[LabeledPoint], strategy: Strategy): DecisionTreeModel = {
@@ -96,10 +96,6 @@ object DecisionTree extends Serializable with Logging {
* Method to train a decision tree model.
* The method supports binary and multiclass classification and regression.
*
- * Note: Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
- * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
- * is recommended to clearly separate classification and regression.
- *
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* For classification, labels should take values {0, 1, ..., numClasses-1}.
* For regression, labels are real numbers.
@@ -108,6 +104,10 @@ object DecisionTree extends Serializable with Logging {
* @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means
* 1 internal node + 2 leaf nodes).
* @return DecisionTreeModel that can be used for prediction.
+ *
+ * @note Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
+ * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
+ * is recommended to clearly separate classification and regression.
*/
@Since("1.0.0")
def train(
@@ -123,10 +123,6 @@ object DecisionTree extends Serializable with Logging {
* Method to train a decision tree model.
* The method supports binary and multiclass classification and regression.
*
- * Note: Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
- * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
- * is recommended to clearly separate classification and regression.
- *
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* For classification, labels should take values {0, 1, ..., numClasses-1}.
* For regression, labels are real numbers.
@@ -136,6 +132,10 @@ object DecisionTree extends Serializable with Logging {
* 1 internal node + 2 leaf nodes).
* @param numClasses Number of classes for classification. Default value of 2.
* @return DecisionTreeModel that can be used for prediction.
+ *
+ * @note Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
+ * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
+ * is recommended to clearly separate classification and regression.
*/
@Since("1.2.0")
def train(
@@ -152,10 +152,6 @@ object DecisionTree extends Serializable with Logging {
* Method to train a decision tree model.
* The method supports binary and multiclass classification and regression.
*
- * Note: Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
- * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
- * is recommended to clearly separate classification and regression.
- *
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* For classification, labels should take values {0, 1, ..., numClasses-1}.
* For regression, labels are real numbers.
@@ -170,6 +166,10 @@ object DecisionTree extends Serializable with Logging {
* indicates that feature n is categorical with k categories
* indexed from 0: {0, 1, ..., k-1}.
* @return DecisionTreeModel that can be used for prediction.
+ *
+ * @note Using [[org.apache.spark.mllib.tree.DecisionTree$#trainClassifier]]
+ * and [[org.apache.spark.mllib.tree.DecisionTree$#trainRegressor]]
+ * is recommended to clearly separate classification and regression.
*/
@Since("1.0.0")
def train(
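
Since every `@note` above points users to `trainClassifier`/`trainRegressor`, here is a minimal sketch of the recommended classification entry point. It assumes an existing `SparkContext` `sc`; the four labeled points and the parameter values are placeholders.

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree

// Assumes an existing SparkContext `sc`; labels take values in {0, 1}.
val training = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.0, 1.0)),
  LabeledPoint(0.0, Vectors.dense(0.2, 0.9)),
  LabeledPoint(1.0, Vectors.dense(1.0, 0.1)),
  LabeledPoint(1.0, Vectors.dense(0.9, 0.0))))

val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()  // empty: all features are continuous
val impurity = "gini"
val maxDepth = 5
val maxBins = 32

val model = DecisionTree.trainClassifier(
  training, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)

println(model.predict(Vectors.dense(0.95, 0.05)))
```

`DecisionTree.trainRegressor` follows the same pattern, with a `"variance"` impurity and no `numClasses` argument.
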
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala
index de14ddf024..09274a2e1b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/loss/Loss.scala
@@ -42,11 +42,13 @@ trait Loss extends Serializable {
/**
* Method to calculate error of the base learner for the gradient boosting calculation.
- * Note: This method is not used by the gradient boosting algorithm but is useful for debugging
- * purposes.
+ *
* @param model Model of the weak learner.
* @param data Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
* @return Measure of model error on data
+ *
+ * @note This method is not used by the gradient boosting algorithm but is useful for debugging
+ * purposes.
*/
@Since("1.2.0")
def computeError(model: TreeEnsembleModel, data: RDD[LabeledPoint]): Double = {
@@ -55,11 +57,13 @@ trait Loss extends Serializable {
/**
* Method to calculate loss when the predictions are already known.
- * Note: This method is used in the method evaluateEachIteration to avoid recomputing the
- * predicted values from previously fit trees.
+ *
* @param prediction Predicted label.
* @param label True label.
* @return Measure of model error on datapoint.
+ *
+ * @note This method is used in the method evaluateEachIteration to avoid recomputing the
+ * predicted values from previously fit trees.
*/
private[spark] def computeError(prediction: Double, label: Double): Double
}
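
To illustrate the second `@note`: `evaluateEachIteration` is the public method that relies on the per-datapoint `computeError`. A minimal sketch, assuming an existing `SparkContext` `sc`; the toy data, the `BoostingStrategy` defaults, and the `SquaredError` loss are illustrative choices, not the only ones.

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.loss.SquaredError

// Assumes an existing SparkContext `sc`; toy regression data.
val data = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(1.0)),
  LabeledPoint(2.0, Vectors.dense(2.0)),
  LabeledPoint(3.0, Vectors.dense(3.0)),
  LabeledPoint(4.0, Vectors.dense(4.0))))

val boostingStrategy = BoostingStrategy.defaultParams("Regression")
boostingStrategy.numIterations = 3

val model = GradientBoostedTrees.train(data, boostingStrategy)

// Training error of the ensemble after each boosting iteration, measured with
// squared error; earlier trees are not re-evaluated from scratch at each step.
val errorPerIteration: Array[Double] = model.evaluateEachIteration(data, SquaredError)
errorPerIteration.zipWithIndex.foreach { case (err, i) =>
  println(s"after iteration ${i + 1}: $err")
}
```
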
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
index 657ed0a8ec..299950785e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala
@@ -187,7 +187,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {
* @param initTreeWeight: learning rate assigned to the first tree.
* @param initTree: first DecisionTreeModel.
* @param loss: evaluation metric.
- * @return a RDD with each element being a zip of the prediction and error
+ * @return an RDD with each element being a zip of the prediction and error
* corresponding to every sample.
*/
@Since("1.4.0")
@@ -213,7 +213,7 @@ object GradientBoostedTreesModel extends Loader[GradientBoostedTreesModel] {
* @param treeWeight: Learning rate.
* @param tree: Tree using which the prediction and error should be updated.
* @param loss: evaluation metric.
- * @return a RDD with each element being a zip of the prediction and error
+ * @return an RDD with each element being a zip of the prediction and error
* corresponding to each sample.
*/
@Since("1.4.0")