about | summary | refs | log | tree | commit | diff
path: root/mllib
diff options
context:
space:
mode:
author: Bimal Tandel <bimal@bimal-MBP.local> 2015-07-29 16:54:58 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-07-29 16:54:58 -0700
commit: 103d8cce78533b38b4f8060b30f7f455113bc6b5 (patch)
tree: 7aa98f7b68e9b17d729ca24b20eb7046a6aebd22 /mllib
parent: 86505962e6c9da1ee18c6a3533e169a22e4f1665 (diff)
download: spark-103d8cce78533b38b4f8060b30f7f455113bc6b5.tar.gz
download: spark-103d8cce78533b38b4f8060b30f7f455113bc6b5.tar.bz2
download: spark-103d8cce78533b38b4f8060b30f7f455113bc6b5.zip
[SPARK-8921] [MLLIB] Add @since tags to mllib.stat
Author: Bimal Tandel <bimal@bimal-MBP.local>

Closes #7730 from BimalTandel/branch_spark_8921 and squashes the following commits:

3ea230a [Bimal Tandel] Spark 8921 add @since tags
Diffstat (limited to 'mllib')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala | 5
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala | 27
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala | 9
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala | 20
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala | 9
5 files changed, 66 insertions, 4 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
index 58a50f9c19..93a6753efd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/KernelDensity.scala
@@ -37,6 +37,7 @@ import org.apache.spark.rdd.RDD
* .setBandwidth(3.0)
* val densities = kd.estimate(Array(-1.0, 2.0, 5.0))
* }}}
+ * @since 1.4.0
*/
@Experimental
class KernelDensity extends Serializable {
@@ -51,6 +52,7 @@ class KernelDensity extends Serializable {
/**
* Sets the bandwidth (standard deviation) of the Gaussian kernel (default: `1.0`).
+ * @since 1.4.0
*/
def setBandwidth(bandwidth: Double): this.type = {
require(bandwidth > 0, s"Bandwidth must be positive, but got $bandwidth.")
@@ -60,6 +62,7 @@ class KernelDensity extends Serializable {
/**
* Sets the sample to use for density estimation.
+ * @since 1.4.0
*/
def setSample(sample: RDD[Double]): this.type = {
this.sample = sample
@@ -68,6 +71,7 @@ class KernelDensity extends Serializable {
/**
* Sets the sample to use for density estimation (for Java users).
+ * @since 1.4.0
*/
def setSample(sample: JavaRDD[java.lang.Double]): this.type = {
this.sample = sample.rdd.asInstanceOf[RDD[Double]]
@@ -76,6 +80,7 @@ class KernelDensity extends Serializable {
/**
* Estimates probability density function at the given array of points.
+ * @since 1.4.0
*/
def estimate(points: Array[Double]): Array[Double] = {
val sample = this.sample
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
index d321cc554c..62da9f2ef2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -33,6 +33,7 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector}
* Reference: [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]
* Zero elements (including explicit zero values) are skipped when calling add(),
* to have time complexity O(nnz) instead of O(n) for each column.
+ * @since 1.1.0
*/
@DeveloperApi
class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with Serializable {
@@ -52,6 +53,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
*
* @param sample The sample in dense/sparse vector format to be added into this summarizer.
* @return This MultivariateOnlineSummarizer object.
+ * @since 1.1.0
*/
def add(sample: Vector): this.type = {
if (n == 0) {
@@ -107,6 +109,7 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
*
* @param other The other MultivariateOnlineSummarizer to be merged.
* @return This MultivariateOnlineSummarizer object.
+ * @since 1.1.0
*/
def merge(other: MultivariateOnlineSummarizer): this.type = {
if (this.totalCnt != 0 && other.totalCnt != 0) {
@@ -149,6 +152,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
this
}
+ /**
+ * @since 1.1.0
+ */
override def mean: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -161,6 +167,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(realMean)
}
+ /**
+ * @since 1.1.0
+ */
override def variance: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -183,14 +192,23 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(realVariance)
}
+ /**
+ * @since 1.1.0
+ */
override def count: Long = totalCnt
+ /**
+ * @since 1.1.0
+ */
override def numNonzeros: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.")
Vectors.dense(nnz)
}
+ /**
+ * @since 1.1.0
+ */
override def max: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -202,6 +220,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(currMax)
}
+ /**
+ * @since 1.1.0
+ */
override def min: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -213,6 +234,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(currMin)
}
+ /**
+ * @since 1.2.0
+ */
override def normL2: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.")
@@ -227,6 +251,9 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S
Vectors.dense(realMagnitude)
}
+ /**
+ * @since 1.2.0
+ */
override def normL1: Vector = {
require(totalCnt > 0, s"Nothing has been added to this summarizer.")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala
index 6a364c9328..3bb49f1228 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala
@@ -21,46 +21,55 @@ import org.apache.spark.mllib.linalg.Vector
/**
* Trait for multivariate statistical summary of a data matrix.
+ * @since 1.0.0
*/
trait MultivariateStatisticalSummary {
/**
* Sample mean vector.
+ * @since 1.0.0
*/
def mean: Vector
/**
* Sample variance vector. Should return a zero vector if the sample size is 1.
+ * @since 1.0.0
*/
def variance: Vector
/**
* Sample size.
+ * @since 1.0.0
*/
def count: Long
/**
* Number of nonzero elements (including explicitly presented zero values) in each column.
+ * @since 1.0.0
*/
def numNonzeros: Vector
/**
* Maximum value of each column.
+ * @since 1.0.0
*/
def max: Vector
/**
* Minimum value of each column.
+ * @since 1.0.0
*/
def min: Vector
/**
* Euclidean magnitude of each column
+ * @since 1.2.0
*/
def normL2: Vector
/**
* L1 norm of each column
+ * @since 1.2.0
*/
def normL1: Vector
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 90332028cf..f84502919e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -32,6 +32,7 @@ import org.apache.spark.rdd.RDD
/**
* :: Experimental ::
* API for statistical functions in MLlib.
+ * @since 1.1.0
*/
@Experimental
object Statistics {
@@ -41,6 +42,7 @@ object Statistics {
*
* @param X an RDD[Vector] for which column-wise summary statistics are to be computed.
* @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics.
+ * @since 1.1.0
*/
def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = {
new RowMatrix(X).computeColumnSummaryStatistics()
@@ -52,6 +54,7 @@ object Statistics {
*
* @param X an RDD[Vector] for which the correlation matrix is to be computed.
* @return Pearson correlation matrix comparing columns in X.
+ * @since 1.1.0
*/
def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X)
@@ -68,6 +71,7 @@ object Statistics {
* @param method String specifying the method to use for computing correlation.
* Supported: `pearson` (default), `spearman`
* @return Correlation matrix comparing columns in X.
+ * @since 1.1.0
*/
def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method)
@@ -81,10 +85,14 @@ object Statistics {
* @param x RDD[Double] of the same cardinality as y.
* @param y RDD[Double] of the same cardinality as x.
* @return A Double containing the Pearson correlation between the two input RDD[Double]s
+ * @since 1.1.0
*/
def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
- /** Java-friendly version of [[corr()]] */
+ /**
+ * Java-friendly version of [[corr()]]
+ * @since 1.4.1
+ */
def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double]): Double =
corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]])
@@ -101,10 +109,14 @@ object Statistics {
* Supported: `pearson` (default), `spearman`
* @return A Double containing the correlation between the two input RDD[Double]s using the
* specified method.
+ * @since 1.1.0
*/
def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
- /** Java-friendly version of [[corr()]] */
+ /**
+ * Java-friendly version of [[corr()]]
+ * @since 1.4.1
+ */
def corr(x: JavaRDD[java.lang.Double], y: JavaRDD[java.lang.Double], method: String): Double =
corr(x.rdd.asInstanceOf[RDD[Double]], y.rdd.asInstanceOf[RDD[Double]], method)
@@ -121,6 +133,7 @@ object Statistics {
* `expected` is rescaled if the `expected` sum differs from the `observed` sum.
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
+ * @since 1.1.0
*/
def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
ChiSqTest.chiSquared(observed, expected)
@@ -135,6 +148,7 @@ object Statistics {
* @param observed Vector containing the observed categorical counts/relative frequencies.
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
+ * @since 1.1.0
*/
def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
@@ -145,6 +159,7 @@ object Statistics {
* @param observed The contingency matrix (containing either counts or relative frequencies).
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
+ * @since 1.1.0
*/
def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed)
@@ -157,6 +172,7 @@ object Statistics {
* Real-valued features will be treated as categorical for each distinct value.
* @return an array containing the ChiSquaredTestResult for every feature against the label.
* The order of the elements in the returned array reflects the order of input features.
+ * @since 1.1.0
*/
def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
ChiSqTest.chiSquaredFeatures(data)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
index cf51b24ff7..9aa7763d78 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
@@ -32,6 +32,7 @@ import org.apache.spark.mllib.util.MLUtils
*
* @param mu The mean vector of the distribution
* @param sigma The covariance matrix of the distribution
+ * @since 1.3.0
*/
@DeveloperApi
class MultivariateGaussian (
@@ -60,12 +61,16 @@ class MultivariateGaussian (
*/
private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants
- /** Returns density of this multivariate Gaussian at given point, x */
+ /** Returns density of this multivariate Gaussian at given point, x
+ * @since 1.3.0
+ */
def pdf(x: Vector): Double = {
pdf(x.toBreeze)
}
- /** Returns the log-density of this multivariate Gaussian at given point, x */
+ /** Returns the log-density of this multivariate Gaussian at given point, x
+ * @since 1.3.0
+ */
def logpdf(x: Vector): Double = {
logpdf(x.toBreeze)
}