Diffstat (limited to 'mllib/src/main')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/Predictor.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala | 74
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala | 1
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala | 8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 14
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala | 8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala | 42
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala | 14
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala | 4
-rwxr-xr-x  mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/package.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala | 7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala | 12
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala | 27
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala | 12
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala | 38
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala | 13
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala | 10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala | 10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala | 28
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala | 21
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala | 24
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala | 13
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala | 10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala | 12
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala | 7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala | 20
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala | 24
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala | 7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/package.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala | 7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala | 9
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala | 7
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala | 11
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala | 8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 10
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala | 2
69 files changed, 336 insertions, 310 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
index aa92edde7a..4b43a3aa5b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
@@ -59,7 +59,7 @@ private[ml] trait PredictorParams extends Params
/**
* :: DeveloperApi ::
* Abstraction for prediction problems (regression and classification). It accepts all NumericType
- * labels and will automatically cast it to DoubleType in [[fit()]].
+ * labels and will automatically cast it to DoubleType in `fit()`.
*
* @tparam FeaturesType Type of features.
* E.g., [[org.apache.spark.mllib.linalg.VectorUDT]] for vector features.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala
index 12b9732a4c..527cb2d547 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala
@@ -239,7 +239,7 @@ object AttributeGroup {
}
}
- /** Creates an attribute group from a [[StructField]] instance. */
+ /** Creates an attribute group from a `StructField` instance. */
def fromStructField(field: StructField): AttributeGroup = {
require(field.dataType == new VectorUDT)
if (field.metadata.contains(ML_ATTR)) {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala
index 27554acdf3..cc7e8bc301 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala
@@ -98,7 +98,7 @@ sealed abstract class Attribute extends Serializable {
def toMetadata(): Metadata = toMetadata(Metadata.empty)
/**
- * Converts to a [[StructField]] with some existing metadata.
+ * Converts to a `StructField` with some existing metadata.
* @param existingMetadata existing metadata to carry over
*/
def toStructField(existingMetadata: Metadata): StructField = {
@@ -109,7 +109,7 @@ sealed abstract class Attribute extends Serializable {
StructField(name.get, DoubleType, nullable = false, newMetadata)
}
- /** Converts to a [[StructField]]. */
+ /** Converts to a `StructField`. */
def toStructField(): StructField = toStructField(Metadata.empty)
override def toString: String = toMetadataImpl(withType = true).toString
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index d07b4adebb..fe29926e0d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -56,13 +56,13 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
/**
* Set threshold in binary classification, in range [0, 1].
*
- * If the estimated probability of class label 1 is > threshold, then predict 1, else 0.
+ * If the estimated probability of class label 1 is &gt; threshold, then predict 1, else 0.
* A high threshold encourages the model to predict 0 more often;
* a low threshold encourages the model to predict 1 more often.
*
* Note: Calling this with threshold p is equivalent to calling `setThresholds(Array(1-p, p))`.
- * When [[setThreshold()]] is called, any user-set value for [[thresholds]] will be cleared.
- * If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be
+ * When `setThreshold()` is called, any user-set value for `thresholds` will be cleared.
+ * If both `threshold` and `thresholds` are set in a ParamMap, then they must be
* equivalent.
*
* Default is 0.5.
@@ -101,12 +101,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
/**
* Get threshold for binary classification.
*
- * If [[thresholds]] is set with length 2 (i.e., binary classification),
+ * If `thresholds` is set with length 2 (i.e., binary classification),
* this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}.
- * Otherwise, returns [[threshold]] if set, or its default value if unset.
+ * Otherwise, returns `threshold` if set, or its default value if unset.
*
* @group getParam
- * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2.
+ * @throws IllegalArgumentException if `thresholds` is set to an array of length other than 2.
*/
override def getThreshold: Double = {
checkThresholdConsistency()
@@ -122,13 +122,13 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
/**
* Set thresholds in multiclass (or binary) classification to adjust the probability of
- * predicting each class. Array must have length equal to the number of classes, with values > 0,
- * excepting that at most one value may be 0.
+ * predicting each class. Array must have length equal to the number of classes,
+ * with values &gt; 0, excepting that at most one value may be 0.
* The class with largest value p/t is predicted, where p is the original probability of that
* class and t is the class's threshold.
*
- * Note: When [[setThresholds()]] is called, any user-set value for [[threshold]] will be cleared.
- * If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be
+ * Note: When `setThresholds()` is called, any user-set value for `threshold` will be cleared.
+ * If both `threshold` and `thresholds` are set in a ParamMap, then they must be
* equivalent.
*
* @group setParam
@@ -141,8 +141,8 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
/**
* Get thresholds for binary or multiclass classification.
*
- * If [[thresholds]] is set, return its value.
- * Otherwise, if [[threshold]] is set, return the equivalent thresholds for binary
+ * If `thresholds` is set, return its value.
+ * Otherwise, if `threshold` is set, return the equivalent thresholds for binary
* classification: (1-threshold, threshold).
* If neither are set, throw an exception.
*
@@ -159,9 +159,9 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
}
/**
- * If [[threshold]] and [[thresholds]] are both set, ensures they are consistent.
+ * If `threshold` and `thresholds` are both set, ensures they are consistent.
*
- * @throws IllegalArgumentException if [[threshold]] and [[thresholds]] are not equivalent
+ * @throws IllegalArgumentException if `threshold` and `thresholds` are not equivalent
*/
protected def checkThresholdConsistency(): Unit = {
if (isSet(threshold) && isSet(thresholds)) {
@@ -207,7 +207,7 @@ class LogisticRegression @Since("1.2.0") (
/**
* Set the ElasticNet mixing parameter.
* For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
- * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
+ * For 0 &lt; alpha &lt; 1, the penalty is a combination of L1 and L2.
* Default is 0.0 which is an L2 penalty.
*
* @group setParam
@@ -294,7 +294,7 @@ class LogisticRegression @Since("1.2.0") (
override def getThresholds: Array[Double] = super.getThresholds
/**
- * Suggested depth for treeAggregate (>= 2).
+ * Suggested depth for treeAggregate (&gt;= 2).
* If the dimensions of features or the number of partitions are large,
* this param could be adjusted to a larger size.
* Default is 2.
@@ -815,7 +815,7 @@ class LogisticRegressionModel private[spark] (
/**
* Predict label for the given feature vector.
- * The behavior of this can be adjusted using [[thresholds]].
+ * The behavior of this can be adjusted using `thresholds`.
*/
override protected def predict(features: Vector): Double = if (isMultinomial) {
super.predict(features)
@@ -1274,7 +1274,7 @@ class BinaryLogisticRegressionSummary private[classification] (
*
* The probability of the multinomial outcome $y$ taking on any of the K possible outcomes is:
*
- * <p><blockquote>
+ * <blockquote>
* $$
* P(y_i=0|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_0}}{\sum_{k=0}^{K-1}
* e^{\vec{x}_i^T \vec{\beta}_k}} \\
@@ -1283,7 +1283,7 @@ class BinaryLogisticRegressionSummary private[classification] (
* P(y_i=K-1|\vec{x}_i, \beta) = \frac{e^{\vec{x}_i^T \vec{\beta}_{K-1}}\,}{\sum_{k=0}^{K-1}
* e^{\vec{x}_i^T \vec{\beta}_k}}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* The model coefficients $\beta = (\beta_0, \beta_1, \beta_2, ..., \beta_{K-1})$ become a matrix
* which has dimension of $K \times (N+1)$ if the intercepts are added. If the intercepts are not
@@ -1292,7 +1292,7 @@ class BinaryLogisticRegressionSummary private[classification] (
* Note that the coefficients in the model above lack identifiability. That is, any constant scalar
* can be added to all of the coefficients and the probabilities remain the same.
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* \frac{e^{\vec{x}_i^T \left(\vec{\beta}_0 + \vec{c}\right)}}{\sum_{k=0}^{K-1}
@@ -1302,7 +1302,7 @@ class BinaryLogisticRegressionSummary private[classification] (
* = \frac{e^{\vec{x}_i^T \vec{\beta}_0}}{\sum_{k=0}^{K-1} e^{\vec{x}_i^T \vec{\beta}_k}}
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* However, when regularization is added to the loss function, the coefficients are indeed
* identifiable because there is only one set of coefficients which minimizes the regularization
@@ -1314,7 +1314,7 @@ class BinaryLogisticRegressionSummary private[classification] (
* The loss of objective function for a single instance of data (we do not include the
* regularization term here for simplicity) can be written as
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* \ell\left(\beta, x_i\right) &= -log{P\left(y_i \middle| \vec{x}_i, \beta\right)} \\
@@ -1322,14 +1322,14 @@ class BinaryLogisticRegressionSummary private[classification] (
* &= log\left(\sum_{k=0}^{K-1} e^{margins_k}\right) - margins_y
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* where ${margins}_k = \vec{x}_i^T \vec{\beta}_k$.
*
* For optimization, we have to calculate the first derivative of the loss function, and a simple
* calculation shows that
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* \frac{\partial \ell(\beta, \vec{x}_i, w_i)}{\partial \beta_{j, k}}
@@ -1338,54 +1338,54 @@ class BinaryLogisticRegressionSummary private[classification] (
* &= x_{i, j} \cdot w_i \cdot multiplier_k
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* where $w_i$ is the sample weight, $I_{y=k}$ is an indicator function
*
- * <p><blockquote>
+ * <blockquote>
* $$
* I_{y=k} = \begin{cases}
* 1 & y = k \\
* 0 & else
* \end{cases}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* and
*
- * <p><blockquote>
+ * <blockquote>
* $$
* multiplier_k = \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k}}{\sum_{k=0}^{K-1}
* e^{\vec{x}_i \cdot \vec{\beta}_k}} - I_{y=k}\right)
* $$
- * </blockquote></p>
+ * </blockquote>
*
* If any of margins is larger than 709.78, the numerical computation of multiplier and loss
* function will suffer from arithmetic overflow. This issue occurs when there are outliers in
* data which are far away from the hyperplane, and this will cause the failing of training once
- * infinity is introduced. Note that this is only a concern when max(margins) > 0.
+ * infinity is introduced. Note that this is only a concern when max(margins) &gt; 0.
*
- * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can easily
- * be rewritten into the following equivalent numerically stable formula.
+ * Fortunately, when max(margins) = maxMargin &gt; 0, the loss function and the multiplier can
+ * easily be rewritten into the following equivalent numerically stable formula.
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \ell\left(\beta, x\right) = log\left(\sum_{k=0}^{K-1} e^{margins_k - maxMargin}\right) -
* margins_{y} + maxMargin
* $$
- * </blockquote></p>
+ * </blockquote>
*
* Note that each term, $(margins_k - maxMargin)$ in the exponential is no greater than zero; as a
* result, overflow will not happen with this formula.
*
* For $multiplier$, a similar trick can be applied as the following,
*
- * <p><blockquote>
+ * <blockquote>
* $$
* multiplier_k = \left(\frac{e^{\vec{x}_i \cdot \vec{\beta}_k - maxMargin}}{\sum_{k'=0}^{K-1}
* e^{\vec{x}_i \cdot \vec{\beta}_{k'} - maxMargin}} - I_{y=k}\right)
* $$
- * </blockquote></p>
+ * </blockquote>
*
* @param bcCoefficients The broadcast coefficients corresponding to the features.
* @param bcFeaturesStd The broadcast standard deviation values of the features.
@@ -1513,7 +1513,7 @@ private class LogisticAggregator(
}
/**
- * When maxMargin > 0, the original formula could cause overflow.
+ * When maxMargin &gt; 0, the original formula could cause overflow.
* We address this by subtracting maxMargin from all the margins, so it's guaranteed
* that all of the new margins will be smaller than zero to prevent arithmetic overflow.
*/
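As an aside on the maxMargin trick documented above, the following standalone Scala sketch shows the shifted log-sum-exp loss and multiplier computation; the names and the Array-based layout are illustrative only and are not the LogisticAggregator internals.

    // Numerically stable multinomial loss and multipliers via maxMargin subtraction.
    // A sketch of the formulas above, not the LogisticAggregator implementation.
    object StableSoftmaxLoss {
      def lossAndMultipliers(margins: Array[Double], label: Int): (Double, Array[Double]) = {
        val maxMargin = margins.max
        // Shift every margin so the largest exponent is exp(0) = 1 and exp() cannot overflow.
        val expShifted = margins.map(m => math.exp(m - maxMargin))
        val sumExp = expShifted.sum
        // loss = log(sum_k e^{margins_k - maxMargin}) - margins_y + maxMargin
        val loss = math.log(sumExp) - margins(label) + maxMargin
        // multiplier_k = e^{margins_k - maxMargin} / sum_k' e^{margins_k' - maxMargin} - I_{y=k}
        val multipliers = expShifted.zipWithIndex.map { case (e, k) =>
          e / sumExp - (if (k == label) 1.0 else 0.0)
        }
        (loss, multipliers)
      }

      def main(args: Array[String]): Unit = {
        // Margins above 709.78 would overflow a naive exp(); the shifted form does not.
        val (loss, mult) = lossAndMultipliers(Array(1000.0, 998.0, 990.0), label = 0)
        println(s"loss = $loss, multipliers = ${mult.mkString(", ")}")
      }
    }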
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
index 88fe7cb4a6..1b45eafbac 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
@@ -289,7 +289,6 @@ object MultilayerPerceptronClassifier
* @param uid uid
* @param layers array of layer sizes including input and output layers
* @param weights the weights of layers
- * @return prediction model
*/
@Since("1.5.0")
@Experimental
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index f1a7676c74..a2ac700000 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -60,16 +60,20 @@ private[ml] trait NaiveBayesParams extends PredictorParams with HasWeightCol {
final def getModelType: String = $(modelType)
}
+// scalastyle:off line.size.limit
/**
* Naive Bayes Classifiers.
* It supports Multinomial NB
- * ([[http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html]])
+ * (see <a href="http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html">
+ * here</a>)
* which can handle finitely supported discrete data. For example, by converting documents into
* TF-IDF vectors, it can be used for document classification. By making every vector a
* binary (0/1) data, it can also be used as Bernoulli NB
- * ([[http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html]]).
+ * (see <a href="http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html">
+ * here</a>).
* The input feature values must be nonnegative.
*/
+// scalastyle:on line.size.limit
@Since("1.5.0")
class NaiveBayes @Since("1.5.0") (
@Since("1.5.0") override val uid: String)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 52345b0626..907c73e2e4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -36,7 +36,7 @@ import org.apache.spark.sql.functions._
/**
- * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] learning algorithm for
+ * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> learning algorithm for
* classification.
* It supports both binary and multiclass labels, as well as both continuous and categorical
* features.
@@ -144,7 +144,7 @@ object RandomForestClassifier extends DefaultParamsReadable[RandomForestClassifi
}
/**
- * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] model for classification.
+ * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> model for classification.
* It supports both binary and multiclass labels, as well as both continuous and categorical
* features.
*
@@ -249,7 +249,7 @@ class RandomForestClassificationModel private[ml] (
* (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)
* and follows the implementation from scikit-learn.
*
- * @see [[DecisionTreeClassificationModel.featureImportances]]
+ * @see `DecisionTreeClassificationModel.featureImportances`
*/
@Since("1.5.0")
lazy val featureImportances: Vector = TreeEnsembleModel.featureImportances(trees, numFeatures)
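A brief usage sketch of the estimator and the featureImportances member mentioned above, assuming a DataFrame `training` with indexed "label" and assembled "features" columns already exists (that DataFrame is an assumption, not part of this diff):

    import org.apache.spark.ml.classification.RandomForestClassifier

    // `training` is assumed to hold indexed labels and assembled feature vectors.
    val rf = new RandomForestClassifier()
      .setNumTrees(50)
      .setMaxDepth(5)
    val model = rf.fit(training)
    // Normalized importances (summing to 1), as described for featureImportances above.
    println(model.featureImportances)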
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index cf11ba37ab..c7a170ddc7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -42,7 +42,7 @@ private[clustering] trait BisectingKMeansParams extends Params
with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol {
/**
- * The desired number of leaf clusters. Must be > 1. Default: 4.
+ * The desired number of leaf clusters. Must be &gt; 1. Default: 4.
* The actual number could be smaller if there are no divisible leaf clusters.
* @group param
*/
@@ -55,8 +55,8 @@ private[clustering] trait BisectingKMeansParams extends Params
def getK: Int = $(k)
/**
- * The minimum number of points (if >= 1.0) or the minimum proportion
- * of points (if < 1.0) of a divisible cluster (default: 1.0).
+ * The minimum number of points (if &gt;= 1.0) or the minimum proportion
+ * of points (if &lt; 1.0) of a divisible cluster (default: 1.0).
* @group expertParam
*/
@Since("2.0.0")
@@ -208,9 +208,9 @@ object BisectingKMeansModel extends MLReadable[BisectingKMeansModel] {
* If bisecting all divisible clusters on the bottom level would result more than `k` leaf clusters,
* larger clusters get higher priority.
*
- * @see [[http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf
- * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques,
- * KDD Workshop on Text Mining, 2000.]]
+ * @see <a href="http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf">
+ * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques,
+ * KDD Workshop on Text Mining, 2000.</a>
*/
@Since("2.0.0")
@Experimental
@@ -296,7 +296,7 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] {
* :: Experimental ::
* Summary of BisectingKMeans.
*
- * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]].
+ * @param predictions `DataFrame` produced by `BisectingKMeansModel.transform()`.
* @param predictionCol Name for column of predicted clusters in `predictions`.
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
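The minDivisibleClusterSize convention described above (an absolute count when &gt;= 1.0, a proportion when &lt; 1.0) can be read as the following small helper; this illustrates the documented rule only and is not the internal code:

    // Resolve the documented minDivisibleClusterSize convention into an absolute point count.
    def minDivisiblePoints(minDivisibleClusterSize: Double, totalPoints: Long): Double =
      if (minDivisibleClusterSize >= 1.0) minDivisibleClusterSize
      else minDivisibleClusterSize * totalPoints

    // e.g. minDivisiblePoints(1.0, 10000) == 1.0; minDivisiblePoints(0.01, 10000) == 100.0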
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
index 8b5f525194..44e832b058 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.{DataFrame, Row}
* :: Experimental ::
* Summary of clustering algorithms.
*
- * @param predictions [[DataFrame]] produced by model.transform().
+ * @param predictions `DataFrame` produced by model.transform().
* @param predictionCol Name for column of predicted clusters in `predictions`.
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 19998ca44b..74109344aa 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -44,7 +44,7 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w
with HasSeed with HasPredictionCol with HasProbabilityCol with HasTol {
/**
- * Number of independent Gaussians in the mixture model. Must be > 1. Default: 2.
+ * Number of independent Gaussians in the mixture model. Must be &gt; 1. Default: 2.
* @group param
*/
@Since("2.0.0")
@@ -76,7 +76,7 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w
* @param weights Weight for each Gaussian distribution in the mixture.
* This is a multinomial probability distribution over the k Gaussians,
* where weights(i) is the weight for Gaussian i, and weights sum to 1.
- * @param gaussians Array of [[MultivariateGaussian]] where gaussians(i) represents
+ * @param gaussians Array of `MultivariateGaussian` where gaussians(i) represents
* the Multivariate Gaussian (Normal) Distribution for Gaussian i
*/
@Since("2.0.0")
@@ -374,7 +374,7 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] {
* :: Experimental ::
* Summary of GaussianMixture.
*
- * @param predictions [[DataFrame]] produced by [[GaussianMixtureModel.transform()]].
+ * @param predictions `DataFrame` produced by `GaussianMixtureModel.transform()`.
* @param predictionCol Name for column of predicted clusters in `predictions`.
* @param probabilityCol Name for column of predicted probability of each cluster
* in `predictions`.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 152bd13b7a..6e124eb6dd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -42,7 +42,7 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
with HasSeed with HasPredictionCol with HasTol {
/**
- * The number of clusters to create (k). Must be > 1. Note that it is possible for fewer than
+ * The number of clusters to create (k). Must be &gt; 1. Note that it is possible for fewer than
* k clusters to be returned, for example, if there are fewer than k distinct points to cluster.
* Default: 2.
* @group param
@@ -72,7 +72,7 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
/**
* Param for the number of steps for the k-means|| initialization mode. This is an advanced
- * setting -- the default of 2 is almost always enough. Must be > 0. Default: 2.
+ * setting -- the default of 2 is almost always enough. Must be &gt; 0. Default: 2.
* @group expertParam
*/
@Since("1.5.0")
@@ -250,7 +250,7 @@ object KMeansModel extends MLReadable[KMeansModel] {
* :: Experimental ::
* K-means clustering with support for k-means|| initialization proposed by Bahmani et al.
*
- * @see [[http://dx.doi.org/10.14778/2180912.2180915 Bahmani et al., Scalable k-means++.]]
+ * @see <a href="http://dx.doi.org/10.14778/2180912.2180915">Bahmani et al., Scalable k-means++.</a>
*/
@Since("1.5.0")
@Experimental
@@ -346,7 +346,7 @@ object KMeans extends DefaultParamsReadable[KMeans] {
* :: Experimental ::
* Summary of KMeans.
*
- * @param predictions [[DataFrame]] produced by [[KMeansModel.transform()]].
+ * @param predictions `DataFrame` produced by `KMeansModel.transform()`.
* @param predictionCol Name for column of predicted clusters in `predictions`.
* @param featuresCol Name for column of features in `predictions`.
* @param k Number of clusters.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 7773802854..6032ab3db9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -50,7 +50,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
with HasSeed with HasCheckpointInterval {
/**
- * Param for the number of topics (clusters) to infer. Must be > 1. Default: 10.
+ * Param for the number of topics (clusters) to infer. Must be &gt; 1. Default: 10.
*
* @group param
*/
@@ -78,13 +78,13 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* - EM
* - Currently only supports symmetric distributions, so all values in the vector should be
* the same.
- * - Values should be > 1.0
+ * - Values should be &gt; 1.0
* - default = uniformly (50 / k) + 1, where 50/k is common in LDA libraries and +1 follows
* from Asuncion et al. (2009), who recommend a +1 adjustment for EM.
* - Online
- * - Values should be >= 0
+ * - Values should be &gt;= 0
* - default = uniformly (1.0 / k), following the implementation from
- * [[https://github.com/Blei-Lab/onlineldavb]].
+ * <a href="https://github.com/Blei-Lab/onlineldavb">here</a>.
*
* @group param
*/
@@ -120,13 +120,13 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
*
* Optimizer-specific parameter settings:
* - EM
- * - Value should be > 1.0
+ * - Value should be &gt; 1.0
* - default = 0.1 + 1, where 0.1 gives a small amount of smoothing and +1 follows
* Asuncion et al. (2009), who recommend a +1 adjustment for EM.
* - Online
- * - Value should be >= 0
+ * - Value should be &gt;= 0
* - default = (1.0 / k), following the implementation from
- * [[https://github.com/Blei-Lab/onlineldavb]].
+ * <a href="https://github.com/Blei-Lab/onlineldavb">here</a>.
*
* @group param
*/
@@ -162,11 +162,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* - Online LDA:
* Hoffman, Blei and Bach. "Online Learning for Latent Dirichlet Allocation."
* Neural Information Processing Systems, 2010.
- * [[http://www.cs.columbia.edu/~blei/papers/HoffmanBleiBach2010b.pdf]]
+ * See <a href="http://www.cs.columbia.edu/~blei/papers/HoffmanBleiBach2010b.pdf">here</a>
* - EM:
* Asuncion et al. "On Smoothing and Inference for Topic Models."
* Uncertainty in Artificial Intelligence, 2009.
- * [[http://arxiv.org/pdf/1205.2662.pdf]]
+ * See <a href="http://arxiv.org/pdf/1205.2662.pdf">here</a>
*
* @group param
*/
@@ -245,9 +245,9 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* Fraction of the corpus to be sampled and used in each iteration of mini-batch gradient descent,
* in range (0, 1].
*
- * Note that this should be adjusted in synch with [[LDA.maxIter]]
+ * Note that this should be adjusted in synch with `LDA.maxIter`
* so the entire corpus is used. Specifically, set both so that
- * maxIterations * miniBatchFraction >= 1.
+ * maxIterations * miniBatchFraction &gt;= 1.
*
* Note: This is the same as the `miniBatchFraction` parameter in
* [[org.apache.spark.mllib.clustering.OnlineLDAOptimizer]].
@@ -293,8 +293,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* cause failures if a data partition is lost, so set this bit with care.
* Note that checkpoints will be cleaned up via reference counting, regardless.
*
- * See [[DistributedLDAModel.getCheckpointFiles]] for getting remaining checkpoints and
- * [[DistributedLDAModel.deleteCheckpointFiles]] for removing remaining checkpoints.
+ * See `DistributedLDAModel.getCheckpointFiles` for getting remaining checkpoints and
+ * `DistributedLDAModel.deleteCheckpointFiles` for removing remaining checkpoints.
*
* Default: true
*
@@ -431,7 +431,7 @@ sealed abstract class LDAModel private[ml] (
private[ml] def getEffectiveTopicConcentration: Double = getModel.topicConcentration
/**
- * The features for LDA should be a [[Vector]] representing the word counts in a document.
+ * The features for LDA should be a `Vector` representing the word counts in a document.
* The vector should be of length vocabSize, with counts for each term (word).
*
* @group setParam
@@ -650,7 +650,7 @@ object LocalLDAModel extends MLReadable[LocalLDAModel] {
* for each training document.
*
* @param oldLocalModelOption Used to implement [[oldLocalModel]] as a lazy val, but keeping
- * [[copy()]] cheap.
+ * `copy()` cheap.
*/
@Since("1.6.0")
@Experimental
@@ -701,7 +701,7 @@ class DistributedLDAModel private[ml] (
* - Even with [[logPrior]], this is NOT the same as the data log likelihood given the
* hyperparameters.
* - This is computed from the topic distributions computed during training. If you call
- * [[logLikelihood()]] on the same training dataset, the topic distributions will be computed
+ * `logLikelihood()` on the same training dataset, the topic distributions will be computed
* again, possibly giving different results.
*/
@Since("1.6.0")
@@ -719,7 +719,7 @@ class DistributedLDAModel private[ml] (
/**
* :: DeveloperApi ::
*
- * If using checkpointing and [[LDA.keepLastCheckpoint]] is set to true, then there may be
+ * If using checkpointing and `LDA.keepLastCheckpoint` is set to true, then there may be
* saved checkpoint files. This method is provided so that users can manage those files.
*
* Note that removing the checkpoints can cause failures if a partition is lost and is needed
@@ -804,13 +804,13 @@ object DistributedLDAModel extends MLReadable[DistributedLDAModel] {
*
* Input data (featuresCol):
* LDA is given a collection of documents as input data, via the featuresCol parameter.
- * Each document is specified as a [[Vector]] of length vocabSize, where each entry is the
+ * Each document is specified as a `Vector` of length vocabSize, where each entry is the
* count for the corresponding term (word) in the document. Feature transformers such as
* [[org.apache.spark.ml.feature.Tokenizer]] and [[org.apache.spark.ml.feature.CountVectorizer]]
* can be useful for converting text to word count vectors.
*
- * @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation
- * (Wikipedia)]]
+ * @see <a href="http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation">
+ * Latent Dirichlet allocation (Wikipedia)</a>
*/
@Since("1.6.0")
@Experimental
@@ -826,7 +826,7 @@ class LDA @Since("1.6.0") (
optimizeDocConcentration -> true, keepLastCheckpoint -> true)
/**
- * The features for LDA should be a [[Vector]] representing the word counts in a document.
+ * The features for LDA should be a `Vector` representing the word counts in a document.
* The vector should be of length vocabSize, with counts for each term (word).
*
* @group setParam
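Tying together the input requirement above (word-count Vectors produced by Tokenizer and CountVectorizer), here is a small end-to-end sketch on toy text; the column names, toy documents, and parameter values are illustrative.

    import org.apache.spark.ml.clustering.LDA
    import org.apache.spark.ml.feature.{CountVectorizer, Tokenizer}
    import org.apache.spark.sql.SparkSession

    object LdaInputSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("lda-sketch").getOrCreate()
        import spark.implicits._

        val docs = Seq(
          "spark mllib lda topic model",
          "gradient descent optimizes a loss",
          "a topic model for text documents").toDF("text")

        // Tokenizer + CountVectorizer produce the length-vocabSize count vectors LDA expects.
        val tokenized = new Tokenizer().setInputCol("text").setOutputCol("words").transform(docs)
        val counted = new CountVectorizer().setInputCol("words").setOutputCol("features")
          .fit(tokenized).transform(tokenized)

        val model = new LDA().setK(2).setMaxIter(10).fit(counted)
        model.describeTopics(3).show(false)
        spark.stop()
      }
    }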
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala
index 6ff36b35ca..682787a830 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/DCT.scala
@@ -32,7 +32,8 @@ import org.apache.spark.sql.types.DataType
* It returns a real vector of the same length representing the DCT. The return vector is scaled
* such that the transform matrix is unitary (aka scaled DCT-II).
*
- * More information on [[https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia]].
+ * More information on <a href="https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II">
+ * DCT-II in Discrete cosine transform (Wikipedia)</a>.
*/
@Since("1.5.0")
class DCT @Since("1.5.0") (@Since("1.5.0") override val uid: String)
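For reference, the unitary (scaled) DCT-II mentioned above can be written out directly as below; this is a naive O(n^2) sketch of the transform's definition, not the FFT-based code the class uses.

    // Orthonormal DCT-II: X_k = c_k * sum_n x_n * cos(pi * (n + 0.5) * k / N),
    // with c_0 = sqrt(1/N) and c_k = sqrt(2/N) for k > 0, so the transform matrix is unitary.
    def dct2(x: Array[Double]): Array[Double] = {
      val n = x.length
      Array.tabulate(n) { k =>
        val scale = if (k == 0) math.sqrt(1.0 / n) else math.sqrt(2.0 / n)
        scale * x.indices.map(i => x(i) * math.cos(math.Pi * (i + 0.5) * k / n)).sum
      }
    }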
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala
index d9d0f32254..f37233e1ab 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala
@@ -37,7 +37,8 @@ import org.apache.spark.sql.types.StructType
* where `k_i` is the i-th coefficient, and both `x` and `k_i` are from `Z_prime^*`
*
* Reference:
- * [[https://en.wikipedia.org/wiki/Perfect_hash_function Wikipedia on Perfect Hash Function]]
+ * <a href="https://en.wikipedia.org/wiki/Perfect_hash_function">
+ * Wikipedia on Perfect Hash Function</a>
*
* @param numEntries The number of entries of the hash functions.
* @param randCoefficients An array of random coefficients, each used by one hash function.
@@ -98,7 +99,7 @@ class MinHashModel private[ml] (
* as binary "1" values.
*
* References:
- * [[https://en.wikipedia.org/wiki/MinHash Wikipedia on MinHash]]
+ * <a href="https://en.wikipedia.org/wiki/MinHash">Wikipedia on MinHash</a>
*/
@Experimental
@Since("2.1.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
index ccfb0ce8f8..19978c97d2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala
@@ -78,11 +78,11 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H
* statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
* feature E is calculated as:
*
- * <p><blockquote>
+ * <blockquote>
* $$
* Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
* $$
- * </blockquote></p>
+ * </blockquote>
*
* For the case $E_{max} == E_{min}$, $Rescaled(e_i) = 0.5 * (max + min)$.
*
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index 25fb6be5af..4be17da3e9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -30,10 +30,12 @@ import org.apache.spark.sql.types.DataType
/**
* Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion,
- * which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an
- * expansion of a product of sums expresses it as a sum of products by using the fact that
- * multiplication distributes over addition". Take a 2-variable feature vector as an example:
- * `(x, y)`, if we want to expand it with degree 2, then we get `(x, x * x, y, x * y, y * y)`.
+ * which is available at
+ * <a href="http://en.wikipedia.org/wiki/Polynomial_expansion">Polynomial expansion (Wikipedia)</a>
+ * , "In mathematics, an expansion of a product of sums expresses it as a sum of products by using
+ * the fact that multiplication distributes over addition". Take a 2-variable feature vector
+ * as an example: `(x, y)`, if we want to expand it with degree 2, then we get
+ * `(x, x * x, y, x * y, y * y)`.
*/
@Since("1.4.0")
class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: String)
@@ -76,11 +78,11 @@ class PolynomialExpansion @Since("1.4.0") (@Since("1.4.0") override val uid: Str
* (n + d choose d) (including 1 and first-order values). For example, let f([a, b, c], 3) be the
* function that expands [a, b, c] to their monomials of degree 3. We have the following recursion:
*
- * <p><blockquote>
+ * <blockquote>
* $$
* f([a, b, c], 3) &= f([a, b], 3) ++ f([a, b], 2) * c ++ f([a, b], 1) * c^2 ++ [c^3]
* $$
- * </blockquote></p>
+ * </blockquote>
*
* To handle sparsity, if c is zero, we can skip all monomials that contain it. We remember the
* current index and increment it properly for sparse input.
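As a concrete check of the expansion size quoted above, a brute-force degree-2 expansion looks like this; the ordering of the monomials is illustrative only and is not necessarily the one the transformer emits.

    // All monomials of total degree 1..2 over an input vector.
    // For Array(x, y) this yields x, y, x*x, x*y, y*y -- 5 = (2+2 choose 2) - 1 terms.
    def expandDegree2(v: Array[Double]): Array[Double] = {
      val linear = v
      val quadratic = for { i <- v.indices; j <- 0 to i } yield v(j) * v(i)
      linear ++ quadratic
    }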
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala
index 1b524c6710..2bff59a0da 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RandomProjection.scala
@@ -113,8 +113,8 @@ class RandomProjectionModel private[ml] (
*
* References:
*
- * 1. [[https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions
- * Wikipedia on Stable Distributions]]
+ * 1. <a href="https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Stable_distributions">
+ * Wikipedia on Stable Distributions</a>
*
* 2. Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint
* arXiv:1408.2927 (2014).
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
index d76d556280..8f125d8fd5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala
@@ -79,8 +79,8 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with
* statistics on the samples in the training set.
*
* The "unit std" is computed using the
- * [[https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation
- * corrected sample standard deviation]],
+ * <a href="https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation">
+ * corrected sample standard deviation</a>,
* which is computed as the square root of the unbiased sample variance.
*/
@Since("1.2.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
index 0ced21365f..a55816249c 100755
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
* @note null values from input array are preserved unless adding null to stopWords
* explicitly.
*
- * @see [[http://en.wikipedia.org/wiki/Stop_words]]
+ * @see <a href="http://en.wikipedia.org/wiki/Stop_words">Stop words (Wikipedia)</a>
*/
@Since("1.5.0")
class StopWordsRemover @Since("1.5.0") (@Since("1.5.0") override val uid: String)
@@ -132,7 +132,8 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] {
* Loads the default stop words for the given language.
* Supported languages: danish, dutch, english, finnish, french, german, hungarian,
* italian, norwegian, portuguese, russian, spanish, swedish, turkish
- * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]]
+ * @see <a href="http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/">
+ * here</a>
*/
@Since("2.0.0")
def loadDefaultStopWords(language: String): Array[String] = {
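A short usage sketch of loadDefaultStopWords together with the transformer, assuming a DataFrame `tokens` with an array-of-strings column "words" (that DataFrame and its column name are assumptions):

    import org.apache.spark.ml.feature.StopWordsRemover

    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered")
      .setStopWords(StopWordsRemover.loadDefaultStopWords("english"))
    val filtered = remover.transform(tokens)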
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala
index b94187ae78..5dd648aecc 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala
@@ -84,6 +84,7 @@ import org.apache.spark.sql.DataFrame
* input dataset, while MLlib's feature transformers operate lazily on individual columns,
* which is more efficient and flexible to handle large and complex datasets.
*
- * @see [[http://scikit-learn.org/stable/modules/preprocessing.html scikit-learn.preprocessing]]
+ * @see <a href="http://scikit-learn.org/stable/modules/preprocessing.html">
+ * scikit-learn.preprocessing</a>
*/
package object feature
diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala
index 8a6b862cda..143bf539b0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/IterativelyReweightedLeastSquares.scala
@@ -50,9 +50,10 @@ private[ml] class IterativelyReweightedLeastSquaresModel(
* @param maxIter maximum number of iterations.
* @param tol the convergence tolerance.
*
- * @see [[http://www.jstor.org/stable/2345503 P. J. Green, Iteratively Reweighted Least Squares
- * for Maximum Likelihood Estimation, and some Robust and Resistant Alternatives,
- * Journal of the Royal Statistical Society. Series B, 1984.]]
+ * @see <a href="http://www.jstor.org/stable/2345503">P. J. Green, Iteratively
+ * Reweighted Least Squares for Maximum Likelihood Estimation, and some Robust
+ * and Resistant Alternatives, Journal of the Royal Statistical Society.
+ * Series B, 1984.</a>
*/
private[ml] class IterativelyReweightedLeastSquares(
val initialModel: WeightedLeastSquaresModel,
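The IRLS procedure this class implements alternates between recomputing per-instance working labels and weights and solving a weighted least squares problem. The schematic loop below illustrates that shape only; `SimpleModel`, `reweight`, and `solveWeightedLeastSquares` are hypothetical stand-ins, not this class's actual collaborators or signatures.

    final case class SimpleModel(coefficients: Array[Double])

    // Schematic IRLS: reweight -> weighted least squares -> convergence check, repeat.
    def irls(
        initial: SimpleModel,
        instances: Seq[(Double, Array[Double])], // (label, features)
        reweight: (SimpleModel, (Double, Array[Double])) => (Double, Double), // (workingLabel, weight)
        solveWeightedLeastSquares: Seq[(Double, Double, Array[Double])] => SimpleModel,
        maxIter: Int,
        tol: Double): SimpleModel = {
      var model = initial
      var converged = false
      var iter = 0
      while (iter < maxIter && !converged) {
        // 1. Recompute working labels and weights under the current model.
        val reweighted = instances.map { inst =>
          val (workingLabel, weight) = reweight(model, inst)
          (workingLabel, weight, inst._2)
        }
        // 2. Solve the weighted least squares problem on the reweighted data.
        val updated = solveWeightedLeastSquares(reweighted)
        // 3. Declare convergence when coefficients stop moving (max absolute change < tol).
        val maxDelta = updated.coefficients.zip(model.coefficients)
          .map { case (a, b) => math.abs(a - b) }.max
        converged = maxDelta < tol
        model = updated
        iter += 1
      }
      model
    }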
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index fa4530927e..e3e03dfd43 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -29,7 +29,7 @@ import org.apache.spark.ml.param._
private[ml] trait HasRegParam extends Params {
/**
- * Param for regularization parameter (>= 0).
+ * Param for regularization parameter (&gt;= 0).
* @group param
*/
final val regParam: DoubleParam = new DoubleParam(this, "regParam", "regularization parameter (>= 0)", ParamValidators.gtEq(0))
@@ -44,7 +44,7 @@ private[ml] trait HasRegParam extends Params {
private[ml] trait HasMaxIter extends Params {
/**
- * Param for maximum number of iterations (>= 0).
+ * Param for maximum number of iterations (&gt;= 0).
* @group param
*/
final val maxIter: IntParam = new IntParam(this, "maxIter", "maximum number of iterations (>= 0)", ParamValidators.gtEq(0))
@@ -238,7 +238,7 @@ private[ml] trait HasOutputCol extends Params {
private[ml] trait HasCheckpointInterval extends Params {
/**
- * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
+ * Param for set checkpoint interval (&gt;= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
* @group param
*/
final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations", (interval: Int) => interval == -1 || interval >= 1)
@@ -334,7 +334,7 @@ private[ml] trait HasElasticNetParam extends Params {
private[ml] trait HasTol extends Params {
/**
- * Param for the convergence tolerance for iterative algorithms (>= 0).
+ * Param for the convergence tolerance for iterative algorithms (&gt;= 0).
* @group param
*/
final val tol: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms (>= 0)", ParamValidators.gtEq(0))
@@ -349,7 +349,7 @@ private[ml] trait HasTol extends Params {
private[ml] trait HasStepSize extends Params {
/**
- * Param for Step size to be used for each iteration of optimization (> 0).
+ * Param for Step size to be used for each iteration of optimization (&gt; 0).
* @group param
*/
final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "Step size to be used for each iteration of optimization (> 0)", ParamValidators.gt(0))
@@ -396,7 +396,7 @@ private[ml] trait HasSolver extends Params {
private[ml] trait HasAggregationDepth extends Params {
/**
- * Param for suggested depth for treeAggregate (>= 2).
+ * Param for suggested depth for treeAggregate (&gt;= 2).
* @group expertParam
*/
final val aggregationDepth: IntParam = new IntParam(this, "aggregationDepth", "suggested depth for treeAggregate (>= 2)", ParamValidators.gtEq(2))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index 9d5ba99978..d6ad1ea6d1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -119,7 +119,8 @@ private[regression] trait AFTSurvivalRegressionParams extends Params
/**
* :: Experimental ::
* Fit a parametric survival regression model named accelerated failure time (AFT) model
- * ([[https://en.wikipedia.org/wiki/Accelerated_failure_time_model]])
+ * (see <a href="https://en.wikipedia.org/wiki/Accelerated_failure_time_model">
+ * Accelerated failure time model (Wikipedia)</a>)
* based on the Weibull distribution of the survival time.
*/
@Experimental
@@ -432,24 +433,24 @@ object AFTSurvivalRegressionModel extends MLReadable[AFTSurvivalRegressionModel]
* Given the values of the covariates $x^{'}$, for random lifetime $t_{i}$ of subjects i = 1,..,n,
* with possible right-censoring, the likelihood function under the AFT model is given as
*
- * <p><blockquote>
+ * <blockquote>
* $$
* L(\beta,\sigma)=\prod_{i=1}^n[\frac{1}{\sigma}f_{0}
* (\frac{\log{t_{i}}-x^{'}\beta}{\sigma})]^{\delta_{i}}S_{0}
* (\frac{\log{t_{i}}-x^{'}\beta}{\sigma})^{1-\delta_{i}}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* Where $\delta_{i}$ is the indicator of the event has occurred i.e. uncensored or not.
* Using $\epsilon_{i}=\frac{\log{t_{i}}-x^{'}\beta}{\sigma}$, the log-likelihood function
* assumes the form
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \iota(\beta,\sigma)=\sum_{i=1}^{n}[-\delta_{i}\log\sigma+
* \delta_{i}\log{f_{0}}(\epsilon_{i})+(1-\delta_{i})\log{S_{0}(\epsilon_{i})}]
* $$
- * </blockquote></p>
+ * </blockquote>
* Where $S_{0}(\epsilon_{i})$ is the baseline survivor function,
* and $f_{0}(\epsilon_{i})$ is corresponding density function.
*
@@ -458,34 +459,34 @@ object AFTSurvivalRegressionModel extends MLReadable[AFTSurvivalRegressionModel]
* to extreme value distribution for log of the lifetime,
* and the $S_{0}(\epsilon)$ function is
*
- * <p><blockquote>
+ * <blockquote>
* $$
* S_{0}(\epsilon_{i})=\exp(-e^{\epsilon_{i}})
* $$
- * </blockquote></p>
+ * </blockquote>
*
* and the $f_{0}(\epsilon_{i})$ function is
*
- * <p><blockquote>
+ * <blockquote>
* $$
* f_{0}(\epsilon_{i})=e^{\epsilon_{i}}\exp(-e^{\epsilon_{i}})
* $$
- * </blockquote></p>
+ * </blockquote>
*
* The log-likelihood function for Weibull distribution of lifetime is
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \iota(\beta,\sigma)=
* -\sum_{i=1}^n[\delta_{i}\log\sigma-\delta_{i}\epsilon_{i}+e^{\epsilon_{i}}]
* $$
- * </blockquote></p>
+ * </blockquote>
*
* Due to minimizing the negative log-likelihood equivalent to maximum a posteriori probability,
* the loss function we use to optimize is $-\iota(\beta,\sigma)$.
* The gradient functions for $\beta$ and $\log\sigma$ respectively are
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \frac{\partial (-\iota)}{\partial \beta}=
* \sum_{1=1}^{n}[\delta_{i}-e^{\epsilon_{i}}]\frac{x_{i}}{\sigma} \\
@@ -493,7 +494,7 @@ object AFTSurvivalRegressionModel extends MLReadable[AFTSurvivalRegressionModel]
* \frac{\partial (-\iota)}{\partial (\log\sigma)}=
* \sum_{i=1}^{n}[\delta_{i}+(\delta_{i}-e^{\epsilon_{i}})\epsilon_{i}]
* $$
- * </blockquote></p>
+ * </blockquote>
*
* @param bcParameters The broadcasted value includes three part: The log of scale parameter,
* the intercept and regression coefficients corresponding to the features.
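The per-instance loss and gradients in the comment above transcribe to the following standalone function; this is a direct reading of the formulas, not the aggregator code, and feature standardization and broadcasting are omitted.

    // One instance's contribution to -l(beta, sigma) and to the gradients
    // with respect to beta and log(sigma), for the Weibull AFT model above.
    def aftInstanceLossAndGradient(
        lifetime: Double,             // t_i > 0
        censor: Double,               // delta_i: 1.0 = event observed, 0.0 = right-censored
        features: Array[Double],      // x_i
        coefficients: Array[Double],  // beta, same length as features
        intercept: Double,
        logSigma: Double): (Double, Array[Double], Double) = {
      val sigma = math.exp(logSigma)
      val xBeta = intercept + features.zip(coefficients).map { case (x, b) => x * b }.sum
      val epsilon = (math.log(lifetime) - xBeta) / sigma
      // -l contribution: delta * log(sigma) - delta * epsilon + e^epsilon
      val loss = censor * logSigma - censor * epsilon + math.exp(epsilon)
      // d(-l)/d(beta)       = (delta - e^epsilon) * x / sigma
      val gradBeta = features.map(x => (censor - math.exp(epsilon)) * x / sigma)
      // d(-l)/d(log(sigma)) = delta + (delta - e^epsilon) * epsilon
      val gradLogSigma = censor + (censor - math.exp(epsilon)) * epsilon
      (loss, gradBeta, gradLogSigma)
    }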
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index 1419da8747..894b6a2ca2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -38,8 +38,8 @@ import org.apache.spark.sql.functions._
/**
- * [[http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree]] learning algorithm
- * for regression.
+ * <a href="http://en.wikipedia.org/wiki/Decision_tree_learning">Decision tree</a>
+ * learning algorithm for regression.
* It supports both continuous and categorical features.
*/
@Since("1.4.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index fa69d60836..ed2d05525d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -38,7 +38,7 @@ import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
/**
- * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
+ * <a href="http://en.wikipedia.org/wiki/Gradient_boosting">Gradient-Boosted Trees (GBTs)</a>
* learning algorithm for regression.
* It supports both continuous and categorical features.
*
@@ -151,7 +151,7 @@ object GBTRegressor extends DefaultParamsReadable[GBTRegressor] {
}
/**
- * [[http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)]]
+ * <a href="http://en.wikipedia.org/wiki/Gradient_boosting">Gradient-Boosted Trees (GBTs)</a>
* model for regression.
* It supports both continuous and categorical features.
* @param _trees Decision trees in the ensemble.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index f33dd0fd29..1201ecd5e4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -123,9 +123,11 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
/**
* :: Experimental ::
*
- * Fit a Generalized Linear Model ([[https://en.wikipedia.org/wiki/Generalized_linear_model]])
- * specified by giving a symbolic description of the linear predictor (link function) and
- * a description of the error distribution (family).
+ * Fit a Generalized Linear Model
+ * (see <a href="https://en.wikipedia.org/wiki/Generalized_linear_model">
+ * Generalized linear model (Wikipedia)</a>)
+ * specified by giving a symbolic description of the linear
+ * predictor (link function) and a description of the error distribution (family).
* It supports "gaussian", "binomial", "poisson" and "gamma" as family.
* Valid link functions for each family is listed below. The first link function of each family
* is the default one.
@@ -196,11 +198,11 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
/**
* Sets the regularization parameter for L2 regularization.
* The regularization term is
- * <p><blockquote>
+ * <blockquote>
* $$
* 0.5 * regParam * L2norm(coefficients)^2
* $$
- * </blockquote></p>
+ * </blockquote>
* Default is 0.0.
*
* @group setParam
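The regularization term quoted above reduces to a one-line helper (illustrative only):

    // 0.5 * regParam * ||coefficients||_2^2, the penalty added to the GLM objective.
    def l2Penalty(regParam: Double, coefficients: Array[Double]): Double =
      0.5 * regParam * coefficients.map(c => c * c).sum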
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 8ea5e1e6c4..eb4e38cc83 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -624,7 +624,8 @@ class LinearRegressionSummary private[regression] (
/**
* Returns the explained variance regression score.
* explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
- * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]]
+ * Reference: <a href="http://en.wikipedia.org/wiki/Explained_variation">
+ * Explained variation (Wikipedia)</a>
*
* @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
* This will change in later Spark versions.
@@ -664,7 +665,8 @@ class LinearRegressionSummary private[regression] (
/**
* Returns R^2^, the coefficient of determination.
- * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
+ * Reference: <a href="http://en.wikipedia.org/wiki/Coefficient_of_determination">
+ * Coefficient of determination (Wikipedia)</a>
*
* @note This ignores instance weights (setting all to 1.0) from [[LinearRegression.weightCol]].
* This will change in later Spark versions.
@@ -805,11 +807,11 @@ class LinearRegressionSummary private[regression] (
* When training with intercept enabled,
* the objective function in the scaled space is given by
*
- * <p><blockquote>
+ * <blockquote>
* $$
* L = 1/2n ||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2,
* $$
- * </blockquote></p>
+ * </blockquote>
*
* where $\bar{x_i}$ is the mean of $x_i$, $\hat{x_i}$ is the standard deviation of $x_i$,
* $\bar{y}$ is the mean of label, and $\hat{y}$ is the standard deviation of label.
@@ -820,7 +822,7 @@ class LinearRegressionSummary private[regression] (
*
* This can be rewritten as
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* L &= 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y}
@@ -828,34 +830,34 @@ class LinearRegressionSummary private[regression] (
* &= 1/2n ||\sum_i w_i^\prime x_i - y / \hat{y} + offset||^2 = 1/2n diff^2
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* where $w_i^\prime$ are the effective coefficients defined by $w_i/\hat{x_i}$, and the offset is
*
- * <p><blockquote>
+ * <blockquote>
* $$
* - \sum_i (w_i/\hat{x_i})\bar{x_i} + \bar{y} / \hat{y}.
* $$
- * </blockquote></p>
+ * </blockquote>
*
* and diff is
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \sum_i w_i^\prime x_i - y / \hat{y} + offset
* $$
- * </blockquote></p>
+ * </blockquote>
*
* Note that the effective coefficients and offset don't depend on training dataset,
* so they can be precomputed.
*
* Now, the first derivative of the objective function in scaled space is
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \frac{\partial L}{\partial w_i} = diff/N (x_i - \bar{x_i}) / \hat{x_i}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* However, $(x_i - \bar{x_i})$ will densify the computation, so it's not
* an ideal formula when the training dataset is in sparse format.
@@ -865,7 +867,7 @@ class LinearRegressionSummary private[regression] (
* objective function from all the samples is
*
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* \frac{\partial L}{\partial w_i} &=
@@ -874,14 +876,14 @@ class LinearRegressionSummary private[regression] (
* &= 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) + correction_i)
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* where $correction_i = - diffSum \bar{x_i} / \hat{x_i}$
*
* Simple math shows that diffSum is actually zero, so we don't even
* need to add the correction terms in the end. From the definition of diff,
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* diffSum &= \sum_j (\sum_i w_i(x_{ij} - \bar{x_i})
@@ -890,17 +892,17 @@ class LinearRegressionSummary private[regression] (
* &= 0
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* As a result, the first derivative of the total objective function only depends on
* the training dataset, which can be easily computed in distributed fashion, and is
* sparse format friendly.
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \frac{\partial L}{\partial w_i} = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i})
* $$
- * </blockquote></p>
+ * </blockquote>
*
* @param bcCoefficients The broadcast coefficients corresponding to the features.
* @param labelStd The standard deviation value of the label.
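The precomputation described above can be checked with plain Scala on illustrative numbers; all values below are made up, only the formulas follow the doc.
{{{
// Effective coefficients, offset and per-sample diff, exactly as derived above.
object ScaledSpaceSketch {
  def main(args: Array[String]): Unit = {
    val coefficients = Array(0.5, -1.2)      // w_i
    val featureMean  = Array(2.0, 3.0)       // \bar{x_i}
    val featureStd   = Array(1.5, 0.5)       // \hat{x_i}
    val (labelMean, labelStd) = (4.0, 2.0)   // \bar{y}, \hat{y}

    // Effective coefficients w_i' = w_i / \hat{x_i}; independent of the training data.
    val effectiveCoef = coefficients.zip(featureStd).map { case (w, s) => w / s }
    // offset = - \sum_i (w_i / \hat{x_i}) \bar{x_i} + \bar{y} / \hat{y}
    val offset = -effectiveCoef.zip(featureMean).map { case (w, m) => w * m }.sum +
      labelMean / labelStd

    // diff for one sample: \sum_i w_i' x_i - y / \hat{y} + offset (a sparse dot product).
    val (x, y) = (Array(2.5, 3.5), 5.0)
    val diff = effectiveCoef.zip(x).map { case (w, xi) => w * xi }.sum - y / labelStd + offset
    println(s"effectiveCoef=${effectiveCoef.mkString(",")} offset=$offset diff=$diff")
  }
}
}}}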
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 0ad00aa6f9..d60f05eed5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -37,7 +37,8 @@ import org.apache.spark.sql.functions._
/**
- * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] learning algorithm for regression.
+ * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a>
+ * learning algorithm for regression.
* It supports both continuous and categorical features.
*/
@Since("1.4.0")
@@ -132,7 +133,7 @@ object RandomForestRegressor extends DefaultParamsReadable[RandomForestRegressor
}
/**
- * [[http://en.wikipedia.org/wiki/Random_forest Random Forest]] model for regression.
+ * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> model for regression.
* It supports both continuous and categorical features.
*
* @param _trees Decision trees in the ensemble.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala
index e137692703..e4de8483cf 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala
@@ -17,15 +17,12 @@
package org.apache.spark.ml.source.libsvm
-import org.apache.spark.ml.linalg.Vector
-import org.apache.spark.sql.{DataFrame, DataFrameReader}
-
/**
- * `libsvm` package implements Spark SQL data source API for loading LIBSVM data as [[DataFrame]].
- * The loaded [[DataFrame]] has two columns: `label` containing labels stored as doubles and
- * `features` containing feature vectors stored as [[Vector]]s.
+ * `libsvm` package implements Spark SQL data source API for loading LIBSVM data as `DataFrame`.
+ * The loaded `DataFrame` has two columns: `label` containing labels stored as doubles and
+ * `features` containing feature vectors stored as `Vector`s.
*
- * To use LIBSVM data source, you need to set "libsvm" as the format in [[DataFrameReader]] and
+ * To use LIBSVM data source, you need to set "libsvm" as the format in `DataFrameReader` and
* optionally specify options, for example:
* {{{
* // Scala
@@ -51,6 +48,6 @@ import org.apache.spark.sql.{DataFrame, DataFrameReader}
* @note This class is public for documentation purpose. Please don't use this class directly.
* Rather, use the data source API as illustrated above.
*
- * @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM datasets]]
+ * @see <a href="https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/">LIBSVM datasets</a>
*/
class LibSVMDataSource private() {}
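A hedged sketch of the Scala usage the elided example block illustrates; the path and numFeatures value are placeholders.
{{{
// Loading LIBSVM data through the data source API.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("libsvm-example").getOrCreate()
val df = spark.read.format("libsvm")
  .option("numFeatures", "780")     // optional; inferred from the data when omitted
  .load("data/mllib/sample_libsvm_data.txt")
df.select("label", "features").show(5)
}}}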
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
index 0a0bc4c006..f3bace8181 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/GradientBoostedTrees.scala
@@ -34,7 +34,7 @@ private[spark] object GradientBoostedTrees extends Logging {
/**
* Method to train a gradient boosting model
- * @param input Training dataset: RDD of [[LabeledPoint]].
+ * @param input Training dataset: RDD of `LabeledPoint`.
* @param seed Random seed.
* @return tuple of ensemble models and weights:
* (array of decision tree models, array of model weights)
@@ -59,12 +59,12 @@ private[spark] object GradientBoostedTrees extends Logging {
/**
* Method to validate a gradient boosting model
- * @param input Training dataset: RDD of [[LabeledPoint]].
+ * @param input Training dataset: RDD of `LabeledPoint`.
* @param validationInput Validation dataset.
* This dataset should be different from the training dataset,
* but it should follow the same distribution.
* E.g., these two datasets could be created from an original dataset
- * by using [[org.apache.spark.rdd.RDD.randomSplit()]]
+ * by using `org.apache.spark.rdd.RDD.randomSplit()`
* @param seed Random seed.
* @return tuple of ensemble models and weights:
* (array of decision tree models, array of model weights)
@@ -162,7 +162,7 @@ private[spark] object GradientBoostedTrees extends Logging {
* Method to calculate error of the base learner for the gradient boosting calculation.
* Note: This method is not used by the gradient boosting algorithm but is useful for debugging
* purposes.
- * @param data Training dataset: RDD of [[LabeledPoint]].
+ * @param data Training dataset: RDD of `LabeledPoint`.
* @param trees Boosted Decision Tree models
* @param treeWeights Learning rates at each boosting iteration.
* @param loss evaluation metric.
@@ -184,7 +184,7 @@ private[spark] object GradientBoostedTrees extends Logging {
/**
* Method to compute error or loss for every iteration of gradient boosting.
*
- * @param data RDD of [[LabeledPoint]]
+ * @param data RDD of `LabeledPoint`
* @param trees Boosted Decision Tree models
* @param treeWeights Learning rates at each boosting iteration.
* @param loss evaluation metric.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
index 8ae5ca3c84..a61ea374cb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
@@ -82,7 +82,7 @@ private[spark] object RandomForest extends Logging {
/**
* Train a random forest.
*
- * @param input Training data: RDD of [[LabeledPoint]]
+ * @param input Training data: RDD of `LabeledPoint`
* @return an unweighted set of trees
*/
def run(
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 5a551533be..40510ad804 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -342,9 +342,9 @@ private[ml] trait HasFeatureSubsetStrategy extends Params {
* - sqrt: recommended by Breiman manual for random forests
* - The defaults of sqrt (classification) and onethird (regression) match the R randomForest
* package.
- * @see [[http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf Breiman (2001)]]
- * @see [[http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf Breiman manual for
- * random forests]]
+ * @see <a href="http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf">Breiman (2001)</a>
+ * @see <a href="http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf">
+ * Breiman manual for random forests</a>
*
* @group param
*/
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
index 6ea52ef7f0..85191d46fd 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -40,7 +40,7 @@ import org.apache.spark.sql.types.StructType
*/
private[ml] trait CrossValidatorParams extends ValidatorParams {
/**
- * Param for number of folds for cross validation. Must be >= 2.
+ * Param for number of folds for cross validation. Must be &gt;= 2.
* Default: 3
*
* @group param
@@ -198,7 +198,7 @@ object CrossValidator extends MLReadable[CrossValidator] {
*
* @param bestModel The best model selected from k-fold cross validation.
* @param avgMetrics Average cross-validation metrics for each paramMap in
- * [[CrossValidator.estimatorParamMaps]], in the corresponding order.
+ * `CrossValidator.estimatorParamMaps`, in the corresponding order.
*/
@Since("1.2.0")
class CrossValidatorModel private[ml] (
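A short tuning sketch showing where numFolds and avgMetrics fit; the estimator, evaluator, grid and `training` DataFrame are illustrative assumptions.
{{{
// k-fold tuning sketch; `training` is an assumed DataFrame with label/features columns.
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val lr = new LinearRegression()
val grid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1))
  .build()

val cv = new CrossValidator()
  .setEstimator(lr)
  .setEvaluator(new RegressionEvaluator())
  .setEstimatorParamMaps(grid)
  .setNumFolds(3)        // must be >= 2; 3 is the default

val cvModel = cv.fit(training)
println(cvModel.avgMetrics.mkString(", "))   // one averaged metric per param map, in grid order
}}}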
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
index e5fa5d53e3..5b7e5ec75c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
@@ -163,7 +163,7 @@ trait MLWritable {
/**
* :: DeveloperApi ::
*
- * Helper trait for making simple [[Params]] types writable. If a [[Params]] class stores
+ * Helper trait for making simple `Params` types writable. If a `Params` class stores
* all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide
* a default implementation of writing saved instances of the class.
* This only handles simple [[org.apache.spark.ml.param.Param]] types; e.g., it will not handle
@@ -231,7 +231,7 @@ trait MLReadable[T] {
/**
* :: DeveloperApi ::
*
- * Helper trait for making simple [[Params]] types readable. If a [[Params]] class stores
+ * Helper trait for making simple `Params` types readable. If a `Params` class stores
* all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide
* a default implementation of reading saved instances of the class.
* This only handles simple [[org.apache.spark.ml.param.Param]] types; e.g., it will not handle
@@ -360,7 +360,7 @@ private[ml] object DefaultParamsReader {
/**
* Get the JSON value of the [[org.apache.spark.ml.param.Param]] of the given name.
- * This can be useful for getting a Param value before an instance of [[Params]]
+ * This can be useful for getting a Param value before an instance of `Params`
* is available.
*/
def getParamValue(paramName: String): JValue = {
@@ -438,7 +438,7 @@ private[ml] object DefaultParamsReader {
}
/**
- * Load a [[Params]] instance from the given path, and return it.
+ * Load a `Params` instance from the given path, and return it.
* This assumes the instance implements [[MLReadable]].
*/
def loadParamsInstance[T](path: String, sc: SparkContext): T = {
@@ -454,7 +454,7 @@ private[ml] object DefaultParamsReader {
private[ml] object MetaAlgorithmReadWrite {
/**
* Examine the given estimator (which may be a compound estimator) and extract a mapping
- * from UIDs to corresponding [[Params]] instances.
+ * from UIDs to corresponding `Params` instances.
*/
def getUidMap(instance: Params): Map[String, Params] = {
val uidList = getUidMapImpl(instance)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 767d056861..fa46ba3ace 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -302,10 +302,11 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] {
/**
* Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
*
- * This is the Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of
- * discrete data. For example, by converting documents into TF-IDF vectors, it can be used for
- * document classification. By making every vector a 0-1 vector, it can also be used as
- * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The input feature values must be nonnegative.
+ * This is the Multinomial NB (see <a href="http://tinyurl.com/lsdw6p">here</a>) which can
+ * handle all kinds of discrete data. For example, by converting documents into TF-IDF
+ * vectors, it can be used for document classification. By making every vector a 0-1 vector,
+ * it can also be used as Bernoulli NB (see <a href="http://tinyurl.com/p7c96j6">here</a>).
+ * The input feature values must be nonnegative.
*/
@Since("0.9.0")
class NaiveBayes private (
@@ -402,9 +403,9 @@ object NaiveBayes {
/**
* Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
*
- * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all
- * kinds of discrete data. For example, by converting documents into TF-IDF vectors, it
- * can be used for document classification.
+ * This is the default Multinomial NB (see <a href="http://tinyurl.com/lsdw6p">here</a>)
+ * which can handle all kinds of discrete data. For example, by converting documents into
+ * TF-IDF vectors, it can be used for document classification.
*
* This version of the method uses a default smoothing parameter of 1.0.
*
@@ -419,9 +420,9 @@ object NaiveBayes {
/**
* Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
*
- * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all
- * kinds of discrete data. For example, by converting documents into TF-IDF vectors, it
- * can be used for document classification.
+ * This is the default Multinomial NB (see <a href="http://tinyurl.com/lsdw6p">here</a>)
+ * which can handle all kinds of discrete data. For example, by converting documents
+ * into TF-IDF vectors, it can be used for document classification.
*
* @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency
* vector or a count vector.
@@ -435,9 +436,10 @@ object NaiveBayes {
/**
* Trains a Naive Bayes model given an RDD of `(label, features)` pairs.
*
- * The model type can be set to either Multinomial NB ([[http://tinyurl.com/lsdw6p]])
- * or Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The Multinomial NB can handle
- * discrete count data and can be called by setting the model type to "multinomial".
+ * The model type can be set to either Multinomial NB (see <a href="http://tinyurl.com/lsdw6p">
+ * here</a>) or Bernoulli NB (see <a href="http://tinyurl.com/p7c96j6">here</a>).
+ * The Multinomial NB can handle discrete count data and can be called by setting the model
+ * type to "multinomial".
* For example, it can be used with word counts or TF-IDF vectors of documents.
* The Bernoulli model fits presence or absence (0-1) counts. By making every vector a
* 0-1 vector and setting the model type to "bernoulli", it fits and predicts as
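A minimal sketch of choosing between the two model types on the RDD-based API; `training: RDD[LabeledPoint]` is assumed.
{{{
// Multinomial vs. Bernoulli NB on the RDD-based API.
import org.apache.spark.mllib.classification.NaiveBayes

val multinomialModel = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")
// With 0-1 feature vectors, the Bernoulli variant can be selected instead:
val bernoulliModel = NaiveBayes.train(training, lambda = 1.0, modelType = "bernoulli")
}}}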
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
index e6b89712e2..31f5141752 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala
@@ -43,13 +43,14 @@ import org.apache.spark.storage.StorageLevel
* @param k the desired number of leaf clusters (default: 4). The actual number could be smaller if
* there are no divisible leaf clusters.
* @param maxIterations the max number of k-means iterations to split clusters (default: 20)
- * @param minDivisibleClusterSize the minimum number of points (if >= 1.0) or the minimum proportion
- * of points (if < 1.0) of a divisible cluster (default: 1)
+ * @param minDivisibleClusterSize the minimum number of points (if &gt;= 1.0) or the minimum
+ * proportion of points (if &lt; 1.0) of a divisible cluster
+ * (default: 1)
* @param seed a random seed (default: hash value of the class name)
*
- * @see [[http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf
- * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques,
- * KDD Workshop on Text Mining, 2000.]]
+ * @see <a href="http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf">
+ * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques,
+ * KDD Workshop on Text Mining, 2000.</a>
*/
@Since("1.6.0")
class BisectingKMeans private (
@@ -100,8 +101,8 @@ class BisectingKMeans private (
def getMaxIterations: Int = this.maxIterations
/**
- * Sets the minimum number of points (if >= `1.0`) or the minimum proportion of points
- * (if < `1.0`) of a divisible cluster (default: 1).
+ * Sets the minimum number of points (if &gt;= `1.0`) or the minimum proportion of points
+ * (if &lt; `1.0`) of a divisible cluster (default: 1).
*/
@Since("1.6.0")
def setMinDivisibleClusterSize(minDivisibleClusterSize: Double): this.type = {
@@ -112,8 +113,8 @@ class BisectingKMeans private (
}
/**
- * Gets the minimum number of points (if >= `1.0`) or the minimum proportion of points
- * (if < `1.0`) of a divisible cluster.
+ * Gets the minimum number of points (if &gt;= `1.0`) or the minimum proportion of points
+ * (if &lt; `1.0`) of a divisible cluster.
*/
@Since("1.6.0")
def getMinDivisibleClusterSize: Double = minDivisibleClusterSize
@@ -218,7 +219,7 @@ class BisectingKMeans private (
}
/**
- * Java-friendly version of [[run()]].
+ * Java-friendly version of `run()`.
*/
def run(data: JavaRDD[Vector]): BisectingKMeansModel = run(data.rdd)
}
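A minimal usage sketch of the parameters documented above; `data: RDD[Vector]` is assumed.
{{{
// Bisecting k-means on an assumed data: RDD[Vector].
import org.apache.spark.mllib.clustering.BisectingKMeans

val bkm = new BisectingKMeans()
  .setK(4)
  .setMaxIterations(20)
  .setMinDivisibleClusterSize(1.0)   // >= 1.0 means a point count, < 1.0 means a proportion

val model = bkm.run(data)
model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"$idx: $center") }
}}}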
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
index 8438015cce..6f1ab091b2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeansModel.scala
@@ -71,7 +71,7 @@ class BisectingKMeansModel private[clustering] (
}
/**
- * Java-friendly version of [[predict()]].
+ * Java-friendly version of `predict()`.
*/
@Since("1.6.0")
def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
@@ -95,7 +95,7 @@ class BisectingKMeansModel private[clustering] (
}
/**
- * Java-friendly version of [[computeCost()]].
+ * Java-friendly version of `computeCost()`.
*/
@Since("1.6.0")
def computeCost(data: JavaRDD[Vector]): Double = this.computeCost(data.rdd)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 56cdeea5f7..6873d4277a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -234,7 +234,7 @@ class GaussianMixture private (
}
/**
- * Java-friendly version of [[run()]]
+ * Java-friendly version of `run()`
*/
@Since("1.3.0")
def run(data: JavaRDD[Vector]): GaussianMixtureModel = run(data.rdd)
@@ -273,8 +273,8 @@ class GaussianMixture private (
private[clustering] object GaussianMixture {
/**
- * Heuristic to distribute the computation of the [[MultivariateGaussian]]s, approximately when
- * d > 25 except for when k is very small.
+ * Heuristic to distribute the computation of the `MultivariateGaussian`s, approximately when
+ * d &gt; 25 except for when k is very small.
* @param k Number of topics
* @param d Number of features
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index c30cc3e239..afbe4f978b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -80,7 +80,7 @@ class GaussianMixtureModel @Since("1.3.0") (
}
/**
- * Java-friendly version of [[predict()]]
+ * Java-friendly version of `predict()`
*/
@Since("1.4.0")
def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index 7c52abdeaa..16742bd284 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -39,8 +39,8 @@ import org.apache.spark.util.Utils
* - Original LDA paper (journal version):
* Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
*
- * @see [[http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation Latent Dirichlet allocation
- * (Wikipedia)]]
+ * @see <a href="http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation">
+ * Latent Dirichlet allocation (Wikipedia)</a>
*/
@Since("1.3.0")
class LDA private (
@@ -113,20 +113,20 @@ class LDA private (
*
* If set to a singleton vector Vector(-1), then docConcentration is set automatically. If set to
* singleton vector Vector(t) where t != -1, then t is replicated to a vector of length k during
- * [[LDAOptimizer.initialize()]]. Otherwise, the [[docConcentration]] vector must be length k.
+ * `LDAOptimizer.initialize()`. Otherwise, the [[docConcentration]] vector must be length k.
* (default = Vector(-1) = automatic)
*
* Optimizer-specific parameter settings:
* - EM
* - Currently only supports symmetric distributions, so all values in the vector should be
* the same.
- * - Values should be > 1.0
+ * - Values should be &gt; 1.0
* - default = uniformly (50 / k) + 1, where 50/k is common in LDA libraries and +1 follows
* from Asuncion et al. (2009), who recommend a +1 adjustment for EM.
* - Online
- * - Values should be >= 0
+ * - Values should be &gt;= 0
* - default = uniformly (1.0 / k), following the implementation from
- * [[https://github.com/Blei-Lab/onlineldavb]].
+ * <a href="https://github.com/Blei-Lab/onlineldavb">here</a>.
*/
@Since("1.5.0")
def setDocConcentration(docConcentration: Vector): this.type = {
@@ -158,13 +158,13 @@ class LDA private (
def getAlpha: Double = getDocConcentration
/**
- * Alias for [[setDocConcentration()]]
+ * Alias for `setDocConcentration()`
*/
@Since("1.5.0")
def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha)
/**
- * Alias for [[setDocConcentration()]]
+ * Alias for `setDocConcentration()`
*/
@Since("1.3.0")
def setAlpha(alpha: Double): this.type = setDocConcentration(alpha)
@@ -195,13 +195,13 @@ class LDA private (
*
* Optimizer-specific parameter settings:
* - EM
- * - Value should be > 1.0
+ * - Value should be &gt; 1.0
* - default = 0.1 + 1, where 0.1 gives a small amount of smoothing and +1 follows
* Asuncion et al. (2009), who recommend a +1 adjustment for EM.
* - Online
- * - Value should be >= 0
+ * - Value should be &gt;= 0
* - default = (1.0 / k), following the implementation from
- * [[https://github.com/Blei-Lab/onlineldavb]].
+ * <a href="https://github.com/Blei-Lab/onlineldavb">here</a>.
*/
@Since("1.3.0")
def setTopicConcentration(topicConcentration: Double): this.type = {
@@ -321,7 +321,7 @@ class LDA private (
* @param documents RDD of documents, which are term (word) count vectors paired with IDs.
* The term count vectors are "bags of words" with a fixed-size vocabulary
* (where the vocabulary size is the length of the vector).
- * Document IDs must be unique and >= 0.
+ * Document IDs must be unique and &gt;= 0.
* @return Inferred LDA model
*/
@Since("1.3.0")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index b5b0e64a2a..017fbc6feb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -171,7 +171,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* The term count vectors are "bags of words" with a fixed-size vocabulary
* (where the vocabulary size is the length of the vector).
* This must use the same vocabulary (ordering of term counts) as in training.
- * Document IDs must be unique and >= 0.
+ * Document IDs must be unique and &gt;= 0.
* @return Estimated topic distribution for each document.
* The returned RDD may be zipped with the given RDD, where each returned vector
* is a multinomial distribution over topics.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 7365ea1f20..9687fc8804 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -563,7 +563,7 @@ private[clustering] object OnlineLDAOptimizer {
*
* An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001)
* avoids explicit computation of variational parameter `phi`.
- * @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]]
+ * @see <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566">here</a>
*
* @return Returns a tuple of `gammad` - estimate of gamma, the topic distribution, `sstatsd` -
* statistics for updating lambda and `ids` - list of termCounts vector indices.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index c760ddd6ad..4d3e265455 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -36,7 +36,7 @@ import org.apache.spark.util.random.XORShiftRandom
* Model produced by [[PowerIterationClustering]].
*
* @param k number of clusters
- * @param assignments an RDD of clustering [[PowerIterationClustering#Assignment]]s
+ * @param assignments an RDD of clustering `PowerIterationClustering#Assignment`s
*/
@Since("1.3.0")
class PowerIterationClusteringModel @Since("1.3.0") (
@@ -103,9 +103,9 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
/**
* Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
- * [[http://www.icml2010.org/papers/387.pdf Lin and Cohen]]. From the abstract: PIC finds a very
- * low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise
- * similarity matrix of the data.
+ * <a href="http://www.icml2010.org/papers/387.pdf">Lin and Cohen</a>. From the abstract: PIC finds
+ * a very low-dimensional embedding of a dataset using truncated power iteration on a normalized
+ * pair-wise similarity matrix of the data.
*
* @param k Number of clusters.
* @param maxIterations Maximum number of iterations of the PIC algorithm.
@@ -113,7 +113,8 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
* as vertex properties, or "degree" to use normalized sum similarities.
* Default: random.
*
- * @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]]
+ * @see <a href="http://en.wikipedia.org/wiki/Spectral_clustering">
+ * Spectral clustering (Wikipedia)</a>
*/
@Since("1.3.0")
class PowerIterationClustering private[clustering] (
@@ -210,7 +211,7 @@ class PowerIterationClustering private[clustering] (
}
/**
- * A Java-friendly version of [[PowerIterationClustering.run]].
+ * A Java-friendly version of `PowerIterationClustering.run`.
*/
@Since("1.3.0")
def run(similarities: JavaRDD[(java.lang.Long, java.lang.Long, java.lang.Double)])
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index f20ab09bf0..85c37c438d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -39,14 +39,14 @@ import org.apache.spark.util.random.XORShiftRandom
* generalized to incorporate forgetfulness (i.e. decay).
* The update rule (for each cluster) is:
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* c_t+1 &= [(c_t * n_t * a) + (x_t * m_t)] / [n_t + m_t] \\
* n_t+t &= n_t * a + m_t
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* Where c_t is the previously estimated centroid for that cluster,
* n_t is the number of points assigned to it thus far, x_t is the centroid
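A direct transcription of the quoted update rule for a single cluster, handy for checking the effect of the decay factor by hand; plain Scala, not the streaming API itself.
{{{
// One step of the quoted update rule, element-wise on the centroid.
// cT: current centroid, nT: points seen so far, xT: batch centroid, mT: batch count, a: decay.
def updateCluster(cT: Array[Double], nT: Double, xT: Array[Double], mT: Double, a: Double)
  : (Array[Double], Double) = {
  val cNext = cT.zip(xT).map { case (c, x) => (c * nT * a + x * mT) / (nT + mT) }
  val nNext = nT * a + mT
  (cNext, nNext)
}

// Smaller a discounts the contribution of previously seen points more heavily.
println(updateCluster(Array(1.0, 1.0), 10.0, Array(3.0, 3.0), 5.0, 1.0)._1.toList)
}}}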
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
index 8f777cc35b..ad99b00a31 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
@@ -74,7 +74,8 @@ class RegressionMetrics @Since("2.0.0") (
/**
* Returns the variance explained by regression.
* explainedVariance = $\sum_i (\hat{y_i} - \bar{y})^2^ / n$
- * @see [[https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained]]
+ * @see <a href="https://en.wikipedia.org/wiki/Fraction_of_variance_unexplained">
+ * Fraction of variance unexplained (Wikipedia)</a>
*/
@Since("1.2.0")
def explainedVariance: Double = {
@@ -110,10 +111,11 @@ class RegressionMetrics @Since("2.0.0") (
/**
* Returns R^2^, the unadjusted coefficient of determination.
- * @see [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
+ * @see <a href="http://en.wikipedia.org/wiki/Coefficient_of_determination">
+ * Coefficient of determination (Wikipedia)</a>
* In case of regression through the origin, the definition of R^2^ is to be modified.
- * @see J. G. Eisenhauer, Regression through the Origin. Teaching Statistics 25, 76-80 (2003)
- * [[https://online.stat.psu.edu/~ajw13/stat501/SpecialTopics/Reg_thru_origin.pdf]]
+ * @see <a href="https://online.stat.psu.edu/~ajw13/stat501/SpecialTopics/Reg_thru_origin.pdf">
+ * J. G. Eisenhauer, Regression through the Origin. Teaching Statistics 25, 76-80 (2003)</a>
*/
@Since("1.2.0")
def r2: Double = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 0f7fbe9556..b533860122 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -147,18 +147,18 @@ object FPGrowthModel extends Loader[FPGrowthModel[_]] {
/**
* A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
- * [[http://dx.doi.org/10.1145/1454008.1454027 Li et al., PFP: Parallel FP-Growth for Query
- * Recommendation]]. PFP distributes computation in such a way that each worker executes an
+ * <a href="http://dx.doi.org/10.1145/1454008.1454027">Li et al., PFP: Parallel FP-Growth for Query
+ * Recommendation</a>. PFP distributes computation in such a way that each worker executes an
* independent group of mining tasks. The FP-Growth algorithm is described in
- * [[http://dx.doi.org/10.1145/335191.335372 Han et al., Mining frequent patterns without candidate
- * generation]].
+ * <a href="http://dx.doi.org/10.1145/335191.335372">Han et al., Mining frequent patterns without
+ * candidate generation</a>.
*
* @param minSupport the minimal support level of the frequent pattern, any pattern that appears
* more than (minSupport * size-of-the-dataset) times will be output
* @param numPartitions number of partitions used by parallel FP-growth
*
- * @see [[http://en.wikipedia.org/wiki/Association_rule_learning Association rule learning
- * (Wikipedia)]]
+ * @see <a href="http://en.wikipedia.org/wiki/Association_rule_learning">
+ * Association rule learning (Wikipedia)</a>
*
*/
@Since("1.3.0")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
index 7382000791..a564167221 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -44,7 +44,8 @@ import org.apache.spark.storage.StorageLevel
/**
* A parallel PrefixSpan algorithm to mine frequent sequential patterns.
* The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining Sequential Patterns
- * Efficiently by Prefix-Projected Pattern Growth ([[http://doi.org/10.1109/ICDE.2001.914830]]).
+ * Efficiently by Prefix-Projected Pattern Growth
+ * (see <a href="http://doi.org/10.1109/ICDE.2001.914830">here</a>).
*
* @param minSupport the minimal support level of the sequential pattern, any pattern that appears
* more than (minSupport * size-of-the-dataset) times will be output
@@ -55,8 +56,8 @@ import org.apache.spark.storage.StorageLevel
* processing. If a projected database exceeds this size, another
* iteration of distributed prefix growth is run.
*
- * @see [[https://en.wikipedia.org/wiki/Sequential_Pattern_Mining Sequential Pattern Mining
- * (Wikipedia)]]
+ * @see <a href="https://en.wikipedia.org/wiki/Sequential_Pattern_Mining">Sequential Pattern Mining
+ * (Wikipedia)</a>
*/
@Since("1.5.0")
class PrefixSpan private (
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
index 03866753b5..9e75217410 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
@@ -385,10 +385,10 @@ class BlockMatrix @Since("1.3.0") (
/**
* Adds the given block matrix `other` to `this` block matrix: `this + other`.
* The matrices must have the same size and matching `rowsPerBlock` and `colsPerBlock`
- * values. If one of the blocks that are being added are instances of [[SparseMatrix]],
- * the resulting sub matrix will also be a [[SparseMatrix]], even if it is being added
- * to a [[DenseMatrix]]. If two dense matrices are added, the output will also be a
- * [[DenseMatrix]].
+ * values. If one of the blocks being added is an instance of `SparseMatrix`,
+ * the resulting sub matrix will also be a `SparseMatrix`, even if it is being added
+ * to a `DenseMatrix`. If two dense matrices are added, the output will also be a
+ * `DenseMatrix`.
*/
@Since("1.3.0")
def add(other: BlockMatrix): BlockMatrix =
@@ -397,10 +397,10 @@ class BlockMatrix @Since("1.3.0") (
/**
* Subtracts the given block matrix `other` from `this` block matrix: `this - other`.
* The matrices must have the same size and matching `rowsPerBlock` and `colsPerBlock`
- * values. If one of the blocks that are being subtracted are instances of [[SparseMatrix]],
- * the resulting sub matrix will also be a [[SparseMatrix]], even if it is being subtracted
- * from a [[DenseMatrix]]. If two dense matrices are subtracted, the output will also be a
- * [[DenseMatrix]].
+ * values. If one of the blocks being subtracted is an instance of `SparseMatrix`,
+ * the resulting sub matrix will also be a `SparseMatrix`, even if it is being subtracted
+ * from a `DenseMatrix`. If two dense matrices are subtracted, the output will also be a
+ * `DenseMatrix`.
*/
@Since("2.0.0")
def subtract(other: BlockMatrix): BlockMatrix =
@@ -447,8 +447,8 @@ class BlockMatrix @Since("1.3.0") (
/**
* Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock`
* of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains
- * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output
- * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause
+ * `SparseMatrix`, they will have to be converted to a `DenseMatrix`. The output
+ * [[BlockMatrix]] will only consist of blocks of `DenseMatrix`. This may cause
* some performance issues until support for multiplying two sparse matrices is added.
*
* @note The behavior of multiply has changed in 1.6.0. `multiply` used to throw an error when
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala
index 008b03d1cc..d2c5b14a5b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala
@@ -101,14 +101,14 @@ class CoordinateMatrix @Since("1.0.0") (
toIndexedRowMatrix().toRowMatrix()
}
- /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */
+ /** Converts to BlockMatrix. Creates blocks of `SparseMatrix` with size 1024 x 1024. */
@Since("1.3.0")
def toBlockMatrix(): BlockMatrix = {
toBlockMatrix(1024, 1024)
}
/**
- * Converts to BlockMatrix. Creates blocks of [[SparseMatrix]].
+ * Converts to BlockMatrix. Creates blocks of `SparseMatrix`.
* @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have
* a smaller value. Must be an integer value greater than 0.
* @param colsPerBlock The number of columns of each block. The blocks at the right edge may have
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
index 809906a158..590e959daa 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala
@@ -90,14 +90,14 @@ class IndexedRowMatrix @Since("1.0.0") (
new RowMatrix(rows.map(_.vector), 0L, nCols)
}
- /** Converts to BlockMatrix. Creates blocks of [[SparseMatrix]] with size 1024 x 1024. */
+ /** Converts to BlockMatrix. Creates blocks of `SparseMatrix` with size 1024 x 1024. */
@Since("1.3.0")
def toBlockMatrix(): BlockMatrix = {
toBlockMatrix(1024, 1024)
}
/**
- * Converts to BlockMatrix. Creates blocks of [[SparseMatrix]].
+ * Converts to BlockMatrix. Creates blocks of `SparseMatrix`.
* @param rowsPerBlock The number of rows of each block. The blocks at the bottom edge may have
* a smaller value. Must be an integer value greater than 0.
* @param colsPerBlock The number of columns of each block. The blocks at the right edge may have
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index 4b120332ab..78a8810052 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -531,7 +531,7 @@ class RowMatrix @Since("1.0.0") (
* decomposition (factorization) for the [[RowMatrix]] of a tall and skinny shape.
* Reference:
* Paul G. Constantine, David F. Gleich. "Tall and skinny QR factorizations in MapReduce
- * architectures" ([[http://dx.doi.org/10.1145/1996092.1996103]])
+ * architectures" (see <a href="http://dx.doi.org/10.1145/1996092.1996103">here</a>)
*
* @param computeQ whether to compute Q
* @return QRDecomposition(Q, R), Q = null if computeQ = false.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
index c49e72646b..0efce3c76f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Gradient.scala
@@ -67,14 +67,14 @@ abstract class Gradient extends Serializable {
* http://statweb.stanford.edu/~tibs/ElemStatLearn/ , Eq. (4.17) on page 119 gives the formula of
* multinomial logistic regression model. A simple calculation shows that
*
- * <p><blockquote>
+ * <blockquote>
* $$
* P(y=0|x, w) = 1 / (1 + \sum_i^{K-1} \exp(x w_i))\\
* P(y=1|x, w) = exp(x w_1) / (1 + \sum_i^{K-1} \exp(x w_i))\\
* ...\\
* P(y=K-1|x, w) = exp(x w_{K-1}) / (1 + \sum_i^{K-1} \exp(x w_i))\\
* $$
- * </blockquote></p>
+ * </blockquote>
*
* for K classes multiclass classification problem.
*
@@ -83,7 +83,7 @@ abstract class Gradient extends Serializable {
* will be (K-1) * N.
*
* As a result, the loss of the objective function for a single instance of data can be written as
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* l(w, x) &= -log P(y|x, w) = -\alpha(y) log P(y=0|x, w) - (1-\alpha(y)) log P(y|x, w) \\
@@ -91,7 +91,7 @@ abstract class Gradient extends Serializable {
* &= log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1}
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* where $\alpha(i) = 1$ if $i \ne 0$, and
* $\alpha(i) = 0$ if $i == 0$,
@@ -100,7 +100,7 @@ abstract class Gradient extends Serializable {
* For optimization, we have to calculate the first derivative of the loss function, and
* a simple calculation shows that
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* \frac{\partial l(w, x)}{\partial w_{ij}} &=
@@ -108,7 +108,7 @@ abstract class Gradient extends Serializable {
* &= multiplier_i * x_j
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* where $\delta_{i, j} = 1$ if $i == j$,
* $\delta_{i, j} = 0$ if $i != j$, and
@@ -118,12 +118,12 @@ abstract class Gradient extends Serializable {
* If any of the margins is larger than 709.78, the numerical computation of the multiplier and loss
* function will suffer from arithmetic overflow. This issue occurs when there are outliers
* in the data which are far away from the hyperplane, and this will cause training to fail once
- * infinity / infinity is introduced. Note that this is only a concern when max(margins) > 0.
+ * infinity / infinity is introduced. Note that this is only a concern when max(margins) &gt; 0.
*
- * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can be
+ * Fortunately, when max(margins) = maxMargin &gt; 0, the loss function and the multiplier can be
* easily rewritten into the following equivalent numerically stable formula.
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* l(w, x) &= log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1} \\
@@ -132,7 +132,7 @@ abstract class Gradient extends Serializable {
* &= log(1 + sum) + maxMargin - (1-\alpha(y)) margins_{y-1}
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
* where sum = $\exp(-maxMargin) + \sum_i^{K-1}\exp(margins_i - maxMargin) - 1$.
*
@@ -141,7 +141,7 @@ abstract class Gradient extends Serializable {
*
* For the multiplier, a similar trick can be applied as follows,
*
- * <p><blockquote>
+ * <blockquote>
* $$
* \begin{align}
* multiplier
@@ -150,7 +150,7 @@ abstract class Gradient extends Serializable {
* &= \exp(margins_i - maxMargin) / (1 + sum) - (1-\alpha(y)\delta_{y, i+1})
* \end{align}
* $$
- * </blockquote></p>
+ * </blockquote>
*
* where each term in $\exp$ is also smaller than zero, so overflow is not a concern.
*
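The overflow-safe rewrite above can be sanity-checked in isolation; a small plain-Scala sketch comparing the naive and rewritten forms of log(1 + \sum_i exp(margins_i)):
{{{
// Both functions compute log(1 + \sum_i exp(margins_i)); only the second survives large margins.
object StableLogSumSketch {
  def naive(margins: Array[Double]): Double =
    math.log(1.0 + margins.map(math.exp).sum)          // overflows once a margin exceeds ~709.78

  def stable(margins: Array[Double]): Double = {
    val maxMargin = math.max(margins.max, 0.0)
    // sum = exp(-maxMargin) + \sum_i exp(margins_i - maxMargin) - 1; every exponent is <= 0
    val sum = math.exp(-maxMargin) + margins.map(m => math.exp(m - maxMargin)).sum - 1.0
    math.log1p(sum) + maxMargin
  }

  def main(args: Array[String]): Unit = {
    println((naive(Array(1.0, 2.0)), stable(Array(1.0, 2.0))))     // small margins: both agree
    println((naive(Array(800.0, 2.0)), stable(Array(800.0, 2.0)))) // naive is Infinity, stable ~800
  }
}
}}}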
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index 123e0bb3e6..67da88e804 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -88,10 +88,10 @@ class GradientDescent private[spark] (private var gradient: Gradient, private va
* convergenceTol is a condition which decides iteration termination.
* The end of iteration is decided based on the logic below.
*
- * - If the norm of the new solution vector is >1, the diff of solution vectors
+ * - If the norm of the new solution vector is &gt;1, the diff of solution vectors
* is compared to relative tolerance which means normalizing by the norm of
* the new solution vector.
- * - If the norm of the new solution vector is <=1, the diff of solution vectors
+ * - If the norm of the new solution vector is &lt;=1, the diff of solution vectors
* is compared to absolute tolerance which is not normalizing.
*
* Must be between 0.0 and 1.0 inclusively.
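A plain-Scala sketch of the convergence rule described above; this is an illustration, not the private Spark helper itself.
{{{
// Relative tolerance when the new solution's norm exceeds 1, absolute tolerance otherwise.
def converged(prev: Array[Double], curr: Array[Double], convergenceTol: Double): Boolean = {
  def norm(v: Array[Double]): Double = math.sqrt(v.map(x => x * x).sum)
  val solutionDiff = norm(prev.zip(curr).map { case (a, b) => a - b })
  val currNorm = norm(curr)
  if (currNorm > 1.0) solutionDiff < convergenceTol * currNorm else solutionDiff < convergenceTol
}
}}}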
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
index e49363c2c6..6232ff30a7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala
@@ -31,7 +31,8 @@ import org.apache.spark.rdd.RDD
/**
* :: DeveloperApi ::
* Class used to solve an optimization problem using Limited-memory BFGS.
- * Reference: [[http://en.wikipedia.org/wiki/Limited-memory_BFGS]]
+ * Reference: <a href="http://en.wikipedia.org/wiki/Limited-memory_BFGS">
+ * Wikipedia on Limited-memory BFGS</a>
* @param gradient Gradient function to be used.
* @param updater Updater to be used to update weights after every iteration.
*/
@@ -48,8 +49,8 @@ class LBFGS(private var gradient: Gradient, private var updater: Updater)
* Set the number of corrections used in the LBFGS update. Default 10.
* Values of numCorrections less than 3 are not recommended; large values
* of numCorrections will result in excessive computing time.
- * 3 < numCorrections < 10 is recommended.
- * Restriction: numCorrections > 0
+ * 3 &lt; numCorrections &lt; 10 is recommended.
+ * Restriction: numCorrections &gt; 0
*/
def setNumCorrections(corrections: Int): this.type = {
require(corrections > 0,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala
index 64d52bae00..b7c9fcfbfe 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/NNLS.scala
@@ -54,7 +54,7 @@ private[spark] object NNLS {
*
* We solve the problem
* min_x 1/2 x^T ata x - x^T atb
- * subject to x >= 0
+ * subject to x &gt;= 0
*
* The method used is similar to one described by Polyak (B. T. Polyak, The conjugate gradient
* method in extremal problems, Zh. Vychisl. Mat. Mat. Fiz. 9(4)(1969), pp. 94-112) for bound-
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala
index 67d484575d..aa7dd1aaa6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala
@@ -95,9 +95,9 @@ class SimpleUpdater extends Updater {
* The corresponding proximal operator for the L1 norm is the soft-thresholding
* function. That is, each weight component is shrunk towards 0 by shrinkageVal.
*
- * If w > shrinkageVal, set weight component to w-shrinkageVal.
- * If w < -shrinkageVal, set weight component to w+shrinkageVal.
- * If -shrinkageVal < w < shrinkageVal, set weight component to 0.
+ * If w &gt; shrinkageVal, set weight component to w-shrinkageVal.
+ * If w &lt; -shrinkageVal, set weight component to w+shrinkageVal.
+ * If -shrinkageVal &lt; w &lt; shrinkageVal, set weight component to 0.
*
* Equivalently, set weight component to signum(w) * max(0.0, abs(w) - shrinkageVal)
*/
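The soft-thresholding rule above written out in one line of plain Scala; shrinkageVal stands for the per-iteration shrinkage amount.
{{{
// signum(w) * max(0.0, abs(w) - shrinkageVal), exactly as described above.
def softThreshold(w: Double, shrinkageVal: Double): Double =
  math.signum(w) * math.max(0.0, math.abs(w) - shrinkageVal)
// e.g. softThreshold(0.8, 0.3) is about 0.5, softThreshold(-0.8, 0.3) about -0.5,
// and softThreshold(0.2, 0.3) is 0.0.
}}}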
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/package.scala b/mllib/src/main/scala/org/apache/spark/mllib/package.scala
index 9810b6f668..8323afcb6a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/package.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/package.scala
@@ -32,7 +32,7 @@ package org.apache.spark
* to reach feature parity with the RDD-based APIs.
* And once we reach feature parity, this package will be deprecated.
*
- * @see [[https://issues.apache.org/jira/browse/SPARK-4591 SPARK-4591]] to track the progress of
- * feature parity
+ * @see <a href="https://issues.apache.org/jira/browse/SPARK-4591">SPARK-4591</a> to track
+ * the progress of feature parity
*/
package object mllib
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala
index 005119616f..32e6ecf630 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala
@@ -48,7 +48,7 @@ class RDDFunctions[T: ClassTag](self: RDD[T]) extends Serializable {
}
/**
- * [[sliding(Int, Int)*]] with step = 1.
+ * `sliding(Int, Int)*` with step = 1.
*/
def sliding(windowSize: Int): RDD[Array[T]] = sliding(windowSize, 1)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index 0039db7ecb..76b1bc13b4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -54,11 +54,12 @@ case class Rating @Since("0.8.0") (
*
* For implicit preference data, the algorithm used is based on
* "Collaborative Filtering for Implicit Feedback Datasets", available at
- * [[http://dx.doi.org/10.1109/ICDM.2008.22]], adapted for the blocked approach used here.
+ * <a href="http://dx.doi.org/10.1109/ICDM.2008.22">here</a>, adapted for the blocked approach
+ * used here.
*
* Essentially instead of finding the low-rank approximations to the rating matrix `R`,
* this finds the approximations for a preference matrix `P` where the elements of `P` are 1 if
- * r > 0 and 0 if r <= 0. The ratings then act as 'confidence' values related to strength of
+ * r &gt; 0 and 0 if r &lt;= 0. The ratings then act as 'confidence' values related to strength of
* indicated user
* preferences rather than explicit ratings given to items.
*/
@@ -282,7 +283,7 @@ class ALS private (
}
/**
- * Java-friendly version of [[ALS.run]].
+ * Java-friendly version of `ALS.run`.
*/
@Since("1.3.0")
def run(ratings: JavaRDD[Rating]): MatrixFactorizationModel = run(ratings.rdd)
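A minimal implicit-feedback sketch; `ratings: RDD[Rating]` holding observation counts is assumed, and the (rank, iterations, lambda, alpha) values are illustrative.
{{{
// Implicit-feedback ALS over an assumed ratings: RDD[Rating].
import org.apache.spark.mllib.recommendation.{ALS, Rating}

val model = ALS.trainImplicit(ratings, 10, 10, 0.01, 1.0)
// Predictions are preference scores rather than ratings; larger observed counts mean
// higher confidence in the inferred preference.
val score = model.predict(1, 42)
}}}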
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
index 24e4dcccc8..23045fa2b6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -146,7 +146,7 @@ class MatrixFactorizationModel @Since("0.8.0") (
}
/**
- * Java-friendly version of [[MatrixFactorizationModel.predict]].
+ * Java-friendly version of `MatrixFactorizationModel.predict`.
*/
@Since("1.2.0")
def predict(usersProducts: JavaPairRDD[JavaInteger, JavaInteger]): JavaRDD[Rating] = {
@@ -195,7 +195,7 @@ class MatrixFactorizationModel @Since("0.8.0") (
* - human-readable (JSON) model metadata to path/metadata/
* - Parquet formatted data to path/data/
*
- * The model may be loaded using [[Loader.load]].
+ * The model may be loaded using `Loader.load`.
*
* @param sc Spark context used to save model data.
* @param path Path specifying the directory in which to save this model.
@@ -320,7 +320,7 @@ object MatrixFactorizationModel extends Loader[MatrixFactorizationModel] {
/**
* Load a model from the given path.
*
- * The model should have been saved by [[Saveable.save]].
+ * The model should have been saved by `Saveable.save`.
*
* @param sc Spark context used for loading model files.
* @param path Path specifying the directory to which the model was saved.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 377326f873..36894d5234 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -238,23 +238,22 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
* Sequential PAV implementation based on:
* Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani.
* "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61.
- * Available from [[http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf]]
+ * Available from <a href="http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf">here</a>
*
* Sequential PAV parallelization based on:
* Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset.
* "An approach to parallelizing isotonic regression."
* Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147.
- * Available from [[http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf]]
+ * Available from <a href="http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf">here</a>
*
- * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]]
+ * @see <a href="http://en.wikipedia.org/wiki/Isotonic_regression">Isotonic regression
+ * (Wikipedia)</a>
*/
@Since("1.3.0")
class IsotonicRegression private (private var isotonic: Boolean) extends Serializable {
/**
* Constructs IsotonicRegression instance with default parameter isotonic = true.
- *
- * @return New instance of IsotonicRegression.
*/
@Since("1.3.0")
def this() = this(true)
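A short usage sketch for the IsotonicRegression API whose docs are touched above; the (label, feature, weight) tuples below are toy values and `sc` is assumed to be an existing SparkContext:

    import org.apache.spark.mllib.regression.{IsotonicRegression, IsotonicRegressionModel}

    // (label, feature, weight) tuples with unit weights.
    val input = sc.parallelize(Seq(
      (1.0, 1.0, 1.0), (2.0, 2.0, 1.0), (1.5, 3.0, 1.0), (4.0, 4.0, 1.0)))

    // setIsotonic(true) fits a non-decreasing sequence via the parallelized PAV scheme cited above.
    val model: IsotonicRegressionModel = new IsotonicRegression().setIsotonic(true).run(input)
    val y = model.predict(2.5)   // linear interpolation between fitted boundaries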
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
index 7a2a7a35a9..7dc0c459ec 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -30,12 +30,15 @@ import org.apache.spark.mllib.linalg.{Vector, Vectors}
* the corresponding joint dataset.
*
* A numerically stable algorithm is implemented to compute the mean and variance of instances:
- * Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]
+ * Reference: <a href="http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance">
+ * variance-wiki</a>
* Zero elements (including explicit zero values) are skipped when calling add(),
* to have time complexity O(nnz) instead of O(n) for each column.
*
* For weighted instances, the unbiased estimation of variance is defined by the reliability
- * weights: [[https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights]].
+ * weights:
+ * see <a href="https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights">
+ * Reliability weights (Wikipedia)</a>.
*/
@Since("1.1.0")
@DeveloperApi
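A minimal sketch of the one-pass summarizer described above, showing how instances are accumulated with `add()` and how per-column statistics are read back; the vectors are toy values:

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

    val summarizer = new MultivariateOnlineSummarizer()
    summarizer.add(Vectors.dense(1.0, 0.0, 3.0))
    summarizer.add(Vectors.sparse(3, Seq((0, 2.0), (1, 1.0))))   // zeros are skipped internally

    val mean = summarizer.mean          // per-column mean
    val variance = summarizer.variance  // unbiased variance (reliability-weighted when weighted)
    val nnz = summarizer.numNonzeros    // per-column non-zero counts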
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 925fdf4d7e..7ba9b29296 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -88,7 +88,7 @@ object Statistics {
def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
/**
- * Java-friendly version of [[corr()]]
+ * Java-friendly version of `corr()`
*/
@Since("1.4.1")
def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double =
@@ -112,7 +112,7 @@ object Statistics {
def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
/**
- * Java-friendly version of [[corr()]]
+ * Java-friendly version of `corr()`
*/
@Since("1.4.1")
def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double =
@@ -176,7 +176,7 @@ object Statistics {
ChiSqTest.chiSquaredFeatures(data)
}
- /** Java-friendly version of [[chiSqTest()]] */
+ /** Java-friendly version of `chiSqTest()` */
@Since("1.5.0")
def chiSqTest(data: JavaRDD[LabeledPoint]): Array[ChiSqTestResult] = chiSqTest(data.rdd)
@@ -186,7 +186,8 @@ object Statistics {
 * distribution of the sample data and the theoretical distribution we can provide a test for
 * the null hypothesis that the sample data comes from that theoretical distribution.
* For more information on KS Test:
- * @see [[https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test]]
+ * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test">
+ * Kolmogorov-Smirnov test (Wikipedia)</a>
*
* @param data an `RDD[Double]` containing the sample of data to test
* @param cdf a `Double => Double` function to calculate the theoretical CDF at a given value
@@ -217,7 +218,7 @@ object Statistics {
KolmogorovSmirnovTest.testOneSample(data, distName, params: _*)
}
- /** Java-friendly version of [[kolmogorovSmirnovTest()]] */
+ /** Java-friendly version of `kolmogorovSmirnovTest()` */
@Since("1.5.0")
@varargs
def kolmogorovSmirnovTest(
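A brief sketch of the Scala `corr` and `kolmogorovSmirnovTest` entry points that the Java-friendly overloads above wrap; the data and distribution parameters are illustrative and `sc` is assumed to be an existing SparkContext:

    import org.apache.spark.mllib.stat.Statistics
    import org.apache.spark.rdd.RDD

    val x: RDD[Double] = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))
    val y: RDD[Double] = sc.parallelize(Seq(2.0, 4.0, 6.0, 8.1))

    val pearson = Statistics.corr(x, y)                // default Pearson correlation
    val spearman = Statistics.corr(x, y, "spearman")   // rank-based alternative

    // One-sample, two-sided KS test against a standard normal CDF (mean 0.0, stddev 1.0).
    val ks = Statistics.kolmogorovSmirnovTest(x, "norm", 0.0, 1.0)
    println(ks.pValue)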
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
index 39c3644450..4cf662e036 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
@@ -28,7 +28,8 @@ import org.apache.spark.mllib.util.MLUtils
* This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
* the event that the covariance matrix is singular, the density will be computed in a
* reduced dimensional subspace under which the distribution is supported.
- * (see [[http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case]])
+ * (see <a href="http://en.wikipedia.org/wiki/Multivariate_normal_distribution#Degenerate_case">
+ * Degenerate case in Multivariate normal distribution (Wikipedia)</a>)
*
* @param mu The mean vector of the distribution
* @param sigma The covariance matrix of the distribution
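A small sketch of constructing and evaluating the distribution documented above, using toy mean and covariance values:

    import org.apache.spark.mllib.linalg.{Matrices, Vectors}
    import org.apache.spark.mllib.stat.distribution.MultivariateGaussian

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 0.5, 0.5, 1.0))   // column-major covariance
    val gaussian = new MultivariateGaussian(mu, sigma)

    val density = gaussian.pdf(Vectors.dense(0.5, -0.5))
    val logDensity = gaussian.logpdf(Vectors.dense(0.5, -0.5))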
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index ece1e41d98..cdeef16135 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD
/**
* A class that implements
- * [[http://en.wikipedia.org/wiki/Gradient_boosting Stochastic Gradient Boosting]]
+ * <a href="http://en.wikipedia.org/wiki/Gradient_boosting">Stochastic Gradient Boosting</a>
* for regression and binary classification.
*
* The implementation is based upon:
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 14f11ce51b..428af21406 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -36,7 +36,7 @@ import org.apache.spark.util.Utils
/**
- * A class that implements a [[http://en.wikipedia.org/wiki/Random_forest Random Forest]]
+ * A class that implements a <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a>
* learning algorithm for classification and regression.
* It supports both continuous and categorical features.
*
@@ -46,9 +46,9 @@ import org.apache.spark.util.Utils
* - The defaults of sqrt (classification) and onethird (regression) match the R randomForest
* package.
*
- * @see [[http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf Breiman (2001)]]
- * @see [[http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf Breiman manual for
- * random forests]]
+ * @see <a href="http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf">Breiman (2001)</a>
+ * @see <a href="http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf">
+ * Breiman manual for random forests</a>
* @param strategy The configuration parameters for the random forest algorithm which specify
* the type of random forest (classification or regression), feature type
* (continuous, categorical), depth of the tree, quantile calculation strategy,
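For context on the strategy parameters described above, a toy classification sketch; the dataset, tree count, and depth are illustrative only and `sc` is assumed to be an existing SparkContext:

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.tree.RandomForest

    val data = sc.parallelize(Seq(
      LabeledPoint(0.0, Vectors.dense(0.0, 1.0)), LabeledPoint(0.0, Vectors.dense(0.1, 0.9)),
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(1.0, Vectors.dense(0.9, 0.1))))

    // featureSubsetStrategy "auto" falls back to sqrt for classification, as noted above.
    val model = RandomForest.trainClassifier(data, numClasses = 2,
      categoricalFeaturesInfo = Map.empty[Int, Int], numTrees = 10,
      featureSubsetStrategy = "auto", impurity = "gini", maxDepth = 4, maxBins = 32)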
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala
index 5cef9d0631..be2704df34 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/Split.scala
@@ -25,7 +25,7 @@ import org.apache.spark.mllib.tree.configuration.FeatureType.FeatureType
* Split applied to a feature
* @param feature feature index
* @param threshold Threshold for continuous feature.
- * Split left if feature <= threshold, else right.
+ * Split left if feature &lt;= threshold, else right.
* @param featureType type of feature -- categorical or continuous
* @param categories Split left if categorical feature value is in this set, else right.
*/
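To make the threshold semantics concrete, a sketch of a continuous split under the rule documented above (rows with feature(0) &lt;= 5.0 go left, the rest go right); the values are illustrative:

    import org.apache.spark.mllib.tree.configuration.FeatureType
    import org.apache.spark.mllib.tree.model.Split

    val split = new Split(feature = 0, threshold = 5.0,
      featureType = FeatureType.Continuous, categories = List.empty[Double])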
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index e96c2bc6ed..6bb3271aac 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -213,7 +213,7 @@ object MLUtils extends Logging {
}
/**
- * Version of [[kFold()]] taking a Long seed.
+ * Version of `kFold()` taking a Long seed.
*/
@Since("2.0.0")
def kFold[T: ClassTag](rdd: RDD[T], numFolds: Int, seed: Long): Array[(RDD[T], RDD[T])] = {
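A minimal sketch of the Long-seeded `kFold` overload shown above, assuming `sc` is an existing SparkContext and using a fixed seed for reproducibility:

    import org.apache.spark.mllib.util.MLUtils
    import org.apache.spark.rdd.RDD

    val data: RDD[Double] = sc.parallelize((1 to 100).map(_.toDouble))
    val folds: Array[(RDD[Double], RDD[Double])] = MLUtils.kFold(data, numFolds = 3, seed = 42L)

    folds.foreach { case (training, validation) =>
      // train on `training`, evaluate on `validation`
      println(s"train=${training.count()}, validation=${validation.count()}")
    }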
@@ -262,7 +262,7 @@ object MLUtils extends Logging {
* @param dataset input dataset
* @param cols a list of vector columns to be converted. New vector columns will be ignored. If
* unspecified, all old vector columns will be converted except nested ones.
- * @return the input [[DataFrame]] with old vector columns converted to the new vector type
+ * @return the input `DataFrame` with old vector columns converted to the new vector type
*/
@Since("2.0.0")
@varargs
@@ -314,7 +314,7 @@ object MLUtils extends Logging {
* @param dataset input dataset
* @param cols a list of vector columns to be converted. Old vector columns will be ignored. If
* unspecified, all new vector columns will be converted except nested ones.
- * @return the input [[DataFrame]] with new vector columns converted to the old vector type
+ * @return the input `DataFrame` with new vector columns converted to the old vector type
*/
@Since("2.0.0")
@varargs
@@ -366,7 +366,7 @@ object MLUtils extends Logging {
* @param dataset input dataset
* @param cols a list of matrix columns to be converted. New matrix columns will be ignored. If
* unspecified, all old matrix columns will be converted except nested ones.
- * @return the input [[DataFrame]] with old matrix columns converted to the new matrix type
+ * @return the input `DataFrame` with old matrix columns converted to the new matrix type
*/
@Since("2.0.0")
@varargs
@@ -416,7 +416,7 @@ object MLUtils extends Logging {
* @param dataset input dataset
* @param cols a list of matrix columns to be converted. Old matrix columns will be ignored. If
* unspecified, all new matrix columns will be converted except nested ones.
- * @return the input [[DataFrame]] with new matrix columns converted to the old matrix type
+ * @return the input `DataFrame` with new matrix columns converted to the old matrix type
*/
@Since("2.0.0")
@varargs
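A short sketch of the column converters documented above, assuming `df` is an existing DataFrame with an old-style mllib vector column named "features" (the column name is illustrative); with no column names given, every eligible column would be converted:

    import org.apache.spark.mllib.util.MLUtils

    val dfNew = MLUtils.convertVectorColumnsToML(df, "features")       // to new ml.linalg vectors
    val dfOld = MLUtils.convertVectorColumnsFromML(dfNew, "features")  // back to mllib vectors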
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala
index c881c8ea50..da0eb04764 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/modelSaveLoad.scala
@@ -72,7 +72,7 @@ trait Loader[M <: Saveable] {
/**
* Load a model from the given path.
*
- * The model should have been saved by [[Saveable.save]].
+ * The model should have been saved by `Saveable.save`.
*
* @param sc Spark context used for loading model files.
* @param path Path specifying the directory to which the model was saved.