author     Xiangrui Meng <meng@databricks.com>    2015-02-04 23:03:47 -0800
committer  Xiangrui Meng <meng@databricks.com>    2015-02-04 23:03:47 -0800
commit     db34690466d67f9c8ac6a145fddb5f7ea30a8d8d (patch)
tree       1646d1f386634a605a7403667aa653ec9ed41f69
parent     975bcef467b35586e5224171071355409f451d2d (diff)
[SPARK-5599] Check MLlib public APIs for 1.3
There are no breaking changes (against 1.2) in this PR. I hid `PythonMLLibAPI`, which is only
called by Py4J, and renamed `SparseMatrix.diag` to `SparseMatrix.spdiag`. All other changes are
documentation and annotations. The `Experimental` tag is removed from `ALS.setAlpha` and `Rating`.
One issue not addressed in this PR is `setCheckpointDir` in `LDA`
(https://issues.apache.org/jira/browse/SPARK-5604).

CC: srowen jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes #4377 from mengxr/SPARK-5599 and squashes the following commits:

17975dc [Xiangrui Meng] fix tests
4487f20 [Xiangrui Meng] remove experimental tag from each stat method because Statistics is experimental already
3cd969a [Xiangrui Meng] remove freeman (sorry~) from StreamLA public doc
55900f5 [Xiangrui Meng] make IR experimental and update its doc
9b8eed3 [Xiangrui Meng] graduate Rating and setAlpha in ALS
b854d28 [Xiangrui Meng] correct iid doc in RandomRDDs
27f5bdd [Xiangrui Meng] update linalg docs and some new method signatures
371721b [Xiangrui Meng] mark fpg as experimental and update its doc
8aca7ee [Xiangrui Meng] change SLR to experimental and update the doc
ebbb2e9 [Xiangrui Meng] mark PIC experimental and update the doc
7830d3b [Xiangrui Meng] mark GMM experimental
a378496 [Xiangrui Meng] use the correct subscript syntax in PIC
c65c424 [Xiangrui Meng] update LDAModel doc
a213b0c [Xiangrui Meng] update GMM constructor
3993054 [Xiangrui Meng] hide algorithm in SLR
ad6b9ce [Xiangrui Meng] Revert "make ClassificatinModel.predict(JavaRDD) return JavaDoubleRDD"
0054684 [Xiangrui Meng] add doc to LRModel's constructor
a89763b [Xiangrui Meng] make ClassificatinModel.predict(JavaRDD) return JavaDoubleRDD
7c0946c [Xiangrui Meng] hide PythonMLLibAPI
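For downstream users of the renamed API, a minimal before/after sketch (values are illustrative; assumes a build that contains this patch):

{{{
import org.apache.spark.mllib.linalg.{SparseMatrix, Vectors}

val v = Vectors.dense(1.0, 2.0, 3.0)

// before this patch: SparseMatrix.diag(v)
// after this patch the sparse variant is named explicitly:
val m: SparseMatrix = SparseMatrix.spdiag(v)  // 3x3 sparse matrix with v on its diagonal
}}}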
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala | 3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala | 13
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala | 2
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala | 26
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala | 16
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala | 27
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala | 48
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala | 12
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala | 57
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala | 13
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala | 21
-rw-r--r--  mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java | 4
-rw-r--r--  mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala | 8
19 files changed, 160 insertions, 119 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 3f29b82ce8..cbd87ea8ae 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -54,11 +54,9 @@ import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.Utils
/**
- * :: DeveloperApi ::
- * The Java stubs necessary for the Python mllib bindings.
+ * The Java stubs necessary for the Python mllib bindings. It is called by Py4J on the Python side.
*/
-@DeveloperApi
-class PythonMLLibAPI extends Serializable {
+private[python] class PythonMLLibAPI extends Serializable {
/**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index 5c9feb6fb2..a668e7a7a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -62,6 +62,9 @@ class LogisticRegressionModel (
s" but was given weights of length ${weights.size}")
}
+ /**
+ * Constructs a [[LogisticRegressionModel]] with weights and intercept for binary classification.
+ */
def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2)
private var threshold: Option[Double] = Some(0.5)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
index 6a3893d0e4..b89f38cf5a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
@@ -35,12 +35,13 @@ import org.apache.spark.mllib.regression.StreamingLinearAlgorithm
* Use a builder pattern to construct a streaming logistic regression
* analysis in an application, like:
*
+ * {{{
* val model = new StreamingLogisticRegressionWithSGD()
* .setStepSize(0.5)
* .setNumIterations(10)
* .setInitialWeights(Vectors.dense(...))
* .trainOn(DStream)
- *
+ * }}}
*/
@Experimental
class StreamingLogisticRegressionWithSGD private[mllib] (
@@ -59,7 +60,7 @@ class StreamingLogisticRegressionWithSGD private[mllib] (
*/
def this() = this(0.1, 50, 1.0, 0.0)
- val algorithm = new LogisticRegressionWithSGD(
+ protected val algorithm = new LogisticRegressionWithSGD(
stepSize, numIterations, regParam, miniBatchFraction)
/** Set the step size for gradient descent. Default: 0.1. */
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 5c626fde4e..0be3014de8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -19,15 +19,18 @@ package org.apache.spark.mllib.clustering
import scala.collection.mutable.IndexedSeq
-import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix, diag, Transpose}
+import breeze.linalg.{DenseMatrix => BreezeMatrix, DenseVector => BreezeVector, Transpose, diag}
-import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors, DenseVector, DenseMatrix, BLAS}
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, Matrices, Vector, Vectors}
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
/**
+ * :: Experimental ::
+ *
* This class performs expectation maximization for multivariate Gaussian
* Mixture Models (GMMs). A GMM represents a composite distribution of
* independent Gaussian distributions with associated "mixing" weights
@@ -44,13 +47,17 @@ import org.apache.spark.util.Utils
* is considered to have occurred.
* @param maxIterations The maximum number of iterations to perform
*/
+@Experimental
class GaussianMixture private (
private var k: Int,
private var convergenceTol: Double,
private var maxIterations: Int,
private var seed: Long) extends Serializable {
- /** A default instance, 2 Gaussians, 100 iterations, 0.01 log-likelihood threshold */
+ /**
+ * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01,
+ * maxIterations: 100, seed: random}.
+ */
def this() = this(2, 0.01, 100, Utils.random.nextLong())
// number of samples per cluster to use when initializing Gaussians
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 1a2178ee7f..af6f83c74b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -19,12 +19,15 @@ package org.apache.spark.mllib.clustering
import breeze.linalg.{DenseVector => BreezeVector}
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.RDD
/**
+ * :: Experimental ::
+ *
* Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points
* are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are
* the respective mean and covariance for each Gaussian distribution i=1..k.
@@ -35,6 +38,7 @@ import org.apache.spark.mllib.util.MLUtils
* @param sigma Covariance maxtrix for each Gaussian in the mixture, where sigma(i) is the
* covariance matrix for Gaussian i
*/
+@Experimental
class GaussianMixtureModel(
val weights: Array[Double],
val gaussians: Array[MultivariateGaussian]) extends Serializable {
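As context for the newly `@Experimental` GMM classes, a minimal usage sketch (assumes an existing `SparkContext` named `sc`; data and parameters are illustrative):

{{{
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors

val data = sc.parallelize(Seq(
  Vectors.dense(0.1, 0.2), Vectors.dense(0.2, 0.1),
  Vectors.dense(9.0, 9.5), Vectors.dense(9.2, 9.1)))

// fit a mixture of k = 2 Gaussians; remaining parameters keep the documented defaults
val gmm = new GaussianMixture().setK(2).run(data)

gmm.weights.foreach(println)               // mixing weights w(i)
gmm.gaussians.foreach(g => println(g.mu))  // component means mu(i)
}}}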
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 19e8aab6ea..b0e991d2f2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -335,7 +335,7 @@ class DistributedLDAModel private (
/**
* For each document in the training set, return the distribution over topics for that document
- * (i.e., "theta_doc").
+ * ("theta_doc").
*
* @return RDD of (document ID, topic distribution) pairs
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index 9b5c155b0a..3b1caf0c67 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -18,6 +18,7 @@
package org.apache.spark.mllib.clustering
import org.apache.spark.{Logging, SparkException}
+import org.apache.spark.annotation.Experimental
import org.apache.spark.graphx._
import org.apache.spark.graphx.impl.GraphImpl
import org.apache.spark.mllib.linalg.Vectors
@@ -26,25 +27,33 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.util.random.XORShiftRandom
/**
+ * :: Experimental ::
+ *
* Model produced by [[PowerIterationClustering]].
*
* @param k number of clusters
* @param assignments an RDD of (vertexID, clusterID) pairs
*/
+@Experimental
class PowerIterationClusteringModel(
val k: Int,
val assignments: RDD[(Long, Int)]) extends Serializable
/**
- * Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by Lin and
- * Cohen (see http://www.icml2010.org/papers/387.pdf). From the abstract: PIC finds a very
+ * :: Experimental ::
+ *
+ * Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
+ * [[http://www.icml2010.org/papers/387.pdf Lin and Cohen]]. From the abstract: PIC finds a very
* low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise
* similarity matrix of the data.
*
* @param k Number of clusters.
* @param maxIterations Maximum number of iterations of the PIC algorithm.
* @param initMode Initialization mode.
+ *
+ * @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]]
*/
+@Experimental
class PowerIterationClustering private[clustering] (
private var k: Int,
private var maxIterations: Int,
@@ -88,11 +97,12 @@ class PowerIterationClustering private[clustering] (
/**
* Run the PIC algorithm.
*
- * @param similarities an RDD of (i, j, s_ij_) tuples representing the affinity matrix, which is
- * the matrix A in the PIC paper. The similarity s_ij_ must be nonnegative.
- * This is a symmetric matrix and hence s_ij_ = s_ji_. For any (i, j) with
- * nonzero similarity, there should be either (i, j, s_ij_) or (j, i, s_ji_)
- * in the input. Tuples with i = j are ignored, because we assume s_ij_ = 0.0.
+ * @param similarities an RDD of (i, j, s,,ij,,) tuples representing the affinity matrix, which is
+ * the matrix A in the PIC paper. The similarity s,,ij,, must be nonnegative.
+ * This is a symmetric matrix and hence s,,ij,, = s,,ji,,. For any (i, j) with
+ * nonzero similarity, there should be either (i, j, s,,ij,,) or
+ * (j, i, s,,ji,,) in the input. Tuples with i = j are ignored, because we
+ * assume s,,ij,, = 0.0.
*
* @return a [[PowerIterationClusteringModel]] that contains the clustering result
*/
@@ -109,7 +119,7 @@ class PowerIterationClustering private[clustering] (
* Runs the PIC algorithm.
*
* @param w The normalized affinity matrix, which is the matrix W in the PIC paper with
- * w_ij_ = a_ij_ / d_ii_ as its edge properties and the initial vector of the power
+ * w,,ij,, = a,,ij,, / d,,ii,, as its edge properties and the initial vector of the power
* iteration as its vertex properties.
*/
private def pic(w: Graph[Double, Double]): PowerIterationClusteringModel = {
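A minimal sketch of running PIC on an affinity RDD of (i, j, s,,ij,,) tuples, following the `run` contract documented above (assumes an existing `SparkContext` `sc`; values are illustrative):

{{{
import org.apache.spark.mllib.clustering.PowerIterationClustering

// symmetric similarities; each pair needs to appear only once
val similarities = sc.parallelize(Seq(
  (0L, 1L, 0.9), (1L, 2L, 0.9), (2L, 3L, 0.1), (3L, 4L, 0.9)))

val model = new PowerIterationClustering()
  .setK(2)
  .setMaxIterations(20)
  .run(similarities)

model.assignments.collect().foreach { case (vertexId, clusterId) =>
  println(s"$vertexId -> $clusterId")
}
}}}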
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index 7752c1988f..f483fd1c7d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -21,7 +21,7 @@ import scala.reflect.ClassTag
import org.apache.spark.Logging
import org.apache.spark.SparkContext._
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Experimental, DeveloperApi}
import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
@@ -29,7 +29,8 @@ import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom
/**
- * :: DeveloperApi ::
+ * :: Experimental ::
+ *
* StreamingKMeansModel extends MLlib's KMeansModel for streaming
* algorithms, so it can keep track of a continuously updated weight
* associated with each cluster, and also update the model by
@@ -39,8 +40,10 @@ import org.apache.spark.util.random.XORShiftRandom
* generalized to incorporate forgetfullness (i.e. decay).
* The update rule (for each cluster) is:
*
+ * {{{
* c_t+1 = [(c_t * n_t * a) + (x_t * m_t)] / [n_t + m_t]
* n_t+t = n_t * a + m_t
+ * }}}
*
* Where c_t is the previously estimated centroid for that cluster,
* n_t is the number of points assigned to it thus far, x_t is the centroid
@@ -61,7 +64,7 @@ import org.apache.spark.util.random.XORShiftRandom
* as batches or points.
*
*/
-@DeveloperApi
+@Experimental
class StreamingKMeansModel(
override val clusterCenters: Array[Vector],
val clusterWeights: Array[Double]) extends KMeansModel(clusterCenters) with Logging {
@@ -140,7 +143,8 @@ class StreamingKMeansModel(
}
/**
- * :: DeveloperApi ::
+ * :: Experimental ::
+ *
* StreamingKMeans provides methods for configuring a
* streaming k-means analysis, training the model on streaming,
* and using the model to make predictions on streaming data.
@@ -149,13 +153,15 @@ class StreamingKMeansModel(
* Use a builder pattern to construct a streaming k-means analysis
* in an application, like:
*
+ * {{{
* val model = new StreamingKMeans()
* .setDecayFactor(0.5)
* .setK(3)
* .setRandomCenters(5, 100.0)
* .trainOn(DStream)
+ * }}}
*/
-@DeveloperApi
+@Experimental
class StreamingKMeans(
var k: Int,
var decayFactor: Double,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 1433ee9a0d..3168d608c9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -25,16 +25,20 @@ import scala.collection.JavaConverters._
import scala.reflect.ClassTag
import org.apache.spark.{HashPartitioner, Logging, Partitioner, SparkException}
+import org.apache.spark.annotation.Experimental
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD}
import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
/**
+ * :: Experimental ::
+ *
* Model trained by [[FPGrowth]], which holds frequent itemsets.
* @param freqItemsets frequent itemset, which is an RDD of (itemset, frequency) pairs
* @tparam Item item type
*/
+@Experimental
class FPGrowthModel[Item: ClassTag](
val freqItemsets: RDD[(Array[Item], Long)]) extends Serializable {
@@ -45,28 +49,35 @@ class FPGrowthModel[Item: ClassTag](
}
/**
- * This class implements Parallel FP-growth algorithm to do frequent pattern matching on input data.
- * Parallel FPGrowth (PFP) partitions computation in such a way that each machine executes an
- * independent group of mining tasks. More detail of this algorithm can be found at
- * [[http://dx.doi.org/10.1145/1454008.1454027, PFP]], and the original FP-growth paper can be
- * found at [[http://dx.doi.org/10.1145/335191.335372, FP-growth]]
+ * :: Experimental ::
+ *
+ * A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
+ * [[http://dx.doi.org/10.1145/1454008.1454027 Li et al., PFP: Parallel FP-Growth for Query
+ * Recommendation]]. PFP distributes computation in such a way that each worker executes an
+ * independent group of mining tasks. The FP-Growth algorithm is described in
+ * [[http://dx.doi.org/10.1145/335191.335372 Han et al., Mining frequent patterns without candidate
+ * generation]].
*
* @param minSupport the minimal support level of the frequent pattern, any pattern appears
* more than (minSupport * size-of-the-dataset) times will be output
* @param numPartitions number of partitions used by parallel FP-growth
+ *
+ * @see [[http://en.wikipedia.org/wiki/Association_rule_learning Association rule learning
+ * (Wikipedia)]]
*/
+@Experimental
class FPGrowth private (
private var minSupport: Double,
private var numPartitions: Int) extends Logging with Serializable {
/**
- * Constructs a FPGrowth instance with default parameters:
- * {minSupport: 0.3, numPartitions: auto}
+ * Constructs a default instance with default parameters {minSupport: `0.3`, numPartitions: same
+ * as the input data}.
*/
def this() = this(0.3, -1)
/**
- * Sets the minimal support level (default: 0.3).
+ * Sets the minimal support level (default: `0.3`).
*/
def setMinSupport(minSupport: Double): this.type = {
this.minSupport = minSupport
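A minimal usage sketch for the `@Experimental` FP-growth API above (assumes an existing `SparkContext` `sc`; transactions are illustrative):

{{{
import org.apache.spark.mllib.fpm.FPGrowth

val transactions = sc.parallelize(Seq(
  Array("a", "b", "c"), Array("a", "b"), Array("b", "c"), Array("a", "c")))

val model = new FPGrowth()
  .setMinSupport(0.5)   // keep itemsets appearing in at least half of the transactions
  .setNumPartitions(2)
  .run(transactions)

model.freqItemsets.collect().foreach { case (itemset, freq) =>
  println(itemset.mkString("{", ",", "}") + ": " + freq)
}
}}}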
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index ad7e86827b..84f8ac2e0d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -115,7 +115,7 @@ sealed trait Matrix extends Serializable {
*
* @param numRows number of rows
* @param numCols number of columns
- * @param values matrix entries in column major
+ * @param values matrix entries in column major if not transposed or in row major otherwise
* @param isTransposed whether the matrix is transposed. If true, `values` stores the matrix in
* row major.
*/
@@ -187,7 +187,7 @@ class DenseMatrix(
this
}
- override def transpose: Matrix = new DenseMatrix(numCols, numRows, values, !isTransposed)
+ override def transpose: DenseMatrix = new DenseMatrix(numCols, numRows, values, !isTransposed)
private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
if (!isTransposed) {
@@ -217,9 +217,11 @@ class DenseMatrix(
}
}
- /** Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed
- * set to false. */
- def toSparse(): SparseMatrix = {
+ /**
+ * Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed
+ * set to false.
+ */
+ def toSparse: SparseMatrix = {
val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble
val colPtrs: Array[Int] = new Array[Int](numCols + 1)
val rowIndices: MArrayBuilder[Int] = new MArrayBuilder.ofInt
@@ -282,7 +284,7 @@ object DenseMatrix {
}
/**
- * Generate a `DenseMatrix` consisting of i.i.d. uniform random numbers.
+ * Generate a `DenseMatrix` consisting of `i.i.d.` uniform random numbers.
* @param numRows number of rows of the matrix
* @param numCols number of columns of the matrix
* @param rng a random number generator
@@ -293,7 +295,7 @@ object DenseMatrix {
}
/**
- * Generate a `DenseMatrix` consisting of i.i.d. gaussian random numbers.
+ * Generate a `DenseMatrix` consisting of `i.i.d.` gaussian random numbers.
* @param numRows number of rows of the matrix
* @param numCols number of columns of the matrix
* @param rng a random number generator
@@ -336,10 +338,10 @@ object DenseMatrix {
*
* @param numRows number of rows
* @param numCols number of columns
- * @param colPtrs the index corresponding to the start of a new column
- * @param rowIndices the row index of the entry. They must be in strictly increasing order for each
- * column
- * @param values non-zero matrix entries in column major
+ * @param colPtrs the index corresponding to the start of a new column (if not transposed)
+ * @param rowIndices the row index of the entry (if not transposed). They must be in strictly
+ * increasing order for each column
+ * @param values nonzero matrix entries in column major (if not transposed)
* @param isTransposed whether the matrix is transposed. If true, the matrix can be considered
* Compressed Sparse Row (CSR) format, where `colPtrs` behaves as rowPtrs,
* and `rowIndices` behave as colIndices, and `values` are stored in row major.
@@ -434,7 +436,7 @@ class SparseMatrix(
this
}
- override def transpose: Matrix =
+ override def transpose: SparseMatrix =
new SparseMatrix(numCols, numRows, colPtrs, rowIndices, values, !isTransposed)
private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
@@ -464,9 +466,11 @@ class SparseMatrix(
}
}
- /** Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed
- * set to false. */
- def toDense(): DenseMatrix = {
+ /**
+ * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed
+ * set to false.
+ */
+ def toDense: DenseMatrix = {
new DenseMatrix(numRows, numCols, toArray)
}
}
@@ -593,7 +597,7 @@ object SparseMatrix {
}
/**
- * Generate a `SparseMatrix` consisting of i.i.d. uniform random numbers. The number of non-zero
+ * Generate a `SparseMatrix` consisting of `i.i.d`. uniform random numbers. The number of non-zero
* elements equal the ceiling of `numRows` x `numCols` x `density`
*
* @param numRows number of rows of the matrix
@@ -608,7 +612,7 @@ object SparseMatrix {
}
/**
- * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers.
+ * Generate a `SparseMatrix` consisting of `i.i.d`. gaussian random numbers.
* @param numRows number of rows of the matrix
* @param numCols number of columns of the matrix
* @param density the desired density for the matrix
@@ -626,7 +630,7 @@ object SparseMatrix {
* @return Square `SparseMatrix` with size `values.length` x `values.length` and non-zero
* `values` on the diagonal
*/
- def diag(vector: Vector): SparseMatrix = {
+ def spdiag(vector: Vector): SparseMatrix = {
val n = vector.size
vector match {
case sVec: SparseVector =>
@@ -722,7 +726,7 @@ object Matrices {
def speye(n: Int): Matrix = SparseMatrix.speye(n)
/**
- * Generate a `DenseMatrix` consisting of i.i.d. uniform random numbers.
+ * Generate a `DenseMatrix` consisting of `i.i.d.` uniform random numbers.
* @param numRows number of rows of the matrix
* @param numCols number of columns of the matrix
* @param rng a random number generator
@@ -732,7 +736,7 @@ object Matrices {
DenseMatrix.rand(numRows, numCols, rng)
/**
- * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers.
+ * Generate a `SparseMatrix` consisting of `i.i.d.` gaussian random numbers.
* @param numRows number of rows of the matrix
* @param numCols number of columns of the matrix
* @param density the desired density for the matrix
@@ -743,7 +747,7 @@ object Matrices {
SparseMatrix.sprand(numRows, numCols, density, rng)
/**
- * Generate a `DenseMatrix` consisting of i.i.d. gaussian random numbers.
+ * Generate a `DenseMatrix` consisting of `i.i.d.` gaussian random numbers.
* @param numRows number of rows of the matrix
* @param numCols number of columns of the matrix
* @param rng a random number generator
@@ -753,7 +757,7 @@ object Matrices {
DenseMatrix.randn(numRows, numCols, rng)
/**
- * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers.
+ * Generate a `SparseMatrix` consisting of `i.i.d.` gaussian random numbers.
* @param numRows number of rows of the matrix
* @param numCols number of columns of the matrix
* @param density the desired density for the matrix
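To illustrate the refined return types and the `toSparse`/`toDense` methods losing their parentheses, a short sketch (values are illustrative):

{{{
import org.apache.spark.mllib.linalg.{DenseMatrix, SparseMatrix}

val dm = new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))

val dmT: DenseMatrix = dm.transpose   // now typed as DenseMatrix rather than Matrix
val sm: SparseMatrix = dm.toSparse    // no parentheses after this patch
val back: DenseMatrix = sm.toDense
}}}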
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 8f75e6f46e..77785bdbd0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -244,8 +244,7 @@ object Vectors {
}
/**
- * Parses a string resulted from `Vector#toString` into
- * an [[org.apache.spark.mllib.linalg.Vector]].
+ * Parses a string resulted from [[Vector.toString]] into a [[Vector]].
*/
def parse(s: String): Vector = {
parseNumeric(NumericParser.parse(s))
@@ -483,6 +482,7 @@ class DenseVector(val values: Array[Double]) extends Vector {
}
object DenseVector {
+ /** Extracts the value array from a dense vector. */
def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values)
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
index 3871152d06..1d25396313 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
@@ -21,7 +21,8 @@ import scala.collection.mutable.ArrayBuffer
import breeze.linalg.{DenseMatrix => BDM}
-import org.apache.spark.{SparkException, Logging, Partitioner}
+import org.apache.spark.{Logging, Partitioner, SparkException}
+import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
@@ -104,6 +105,8 @@ private[mllib] object GridPartitioner {
}
/**
+ * :: Experimental ::
+ *
* Represents a distributed matrix in blocks of local matrices.
*
* @param blocks The RDD of sub-matrix blocks ((blockRowIndex, blockColIndex), sub-matrix) that
@@ -118,6 +121,7 @@ private[mllib] object GridPartitioner {
* @param nCols Number of columns of this matrix. If the supplied value is less than or equal to
* zero, the number of columns will be calculated when `numCols` is invoked.
*/
+@Experimental
class BlockMatrix(
val blocks: RDD[((Int, Int), Matrix)],
val rowsPerBlock: Int,
@@ -177,6 +181,10 @@ class BlockMatrix(
assert(cols <= nCols, s"The number of columns $cols is more than claimed $nCols.")
}
+ /**
+ * Validates the block matrix info against the matrix data (`blocks`) and throws an exception if
+ * any error is found.
+ */
def validate(): Unit = {
logDebug("Validating BlockMatrix...")
// check if the matrix is larger than the claimed dimensions
@@ -351,7 +359,7 @@ class BlockMatrix(
if (a.nonEmpty && b.nonEmpty) {
val C = b.head match {
case dense: DenseMatrix => a.head.multiply(dense)
- case sparse: SparseMatrix => a.head.multiply(sparse.toDense())
+ case sparse: SparseMatrix => a.head.multiply(sparse.toDense)
case _ => throw new SparkException(s"Unrecognized matrix type ${b.head.getClass}.")
}
Iterator(((blockRowIndex, blockColIndex), C.toBreeze))
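A minimal sketch of building and validating the newly `@Experimental` `BlockMatrix` (assumes an existing `SparkContext` `sc`; the block layout is illustrative, with missing blocks treated as zero):

{{{
import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix}
import org.apache.spark.mllib.linalg.distributed.BlockMatrix

// a 4x4 matrix stored as 2x2 blocks, keyed by (blockRowIndex, blockColIndex)
val blocks = sc.parallelize(Seq[((Int, Int), Matrix)](
  ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0))),
  ((1, 1), new DenseMatrix(2, 2, Array(2.0, 0.0, 0.0, 2.0)))))

val mat = new BlockMatrix(blocks, 2, 2)  // rowsPerBlock = 2, colsPerBlock = 2
mat.validate()                           // throws if block sizes/indices are inconsistent
println(s"${mat.numRows()} x ${mat.numCols()}")  // 4 x 4
}}}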
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
index 955c593a08..8341bb86af 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
@@ -29,13 +29,13 @@ import org.apache.spark.util.Utils
/**
* :: Experimental ::
- * Generator methods for creating RDDs comprised of i.i.d. samples from some distribution.
+ * Generator methods for creating RDDs comprised of `i.i.d.` samples from some distribution.
*/
@Experimental
object RandomRDDs {
/**
- * Generates an RDD comprised of i.i.d. samples from the uniform distribution `U(0.0, 1.0)`.
+ * Generates an RDD comprised of `i.i.d.` samples from the uniform distribution `U(0.0, 1.0)`.
*
* To transform the distribution in the generated RDD from `U(0.0, 1.0)` to `U(a, b)`, use
* `RandomRDDs.uniformRDD(sc, n, p, seed).map(v => a + (b - a) * v)`.
@@ -44,7 +44,7 @@ object RandomRDDs {
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Double] comprised of i.i.d. samples ~ `U(0.0, 1.0)`.
+ * @return RDD[Double] comprised of `i.i.d.` samples ~ `U(0.0, 1.0)`.
*/
def uniformRDD(
sc: SparkContext,
@@ -81,7 +81,7 @@ object RandomRDDs {
}
/**
- * Generates an RDD comprised of i.i.d. samples from the standard normal distribution.
+ * Generates an RDD comprised of `i.i.d.` samples from the standard normal distribution.
*
* To transform the distribution in the generated RDD from standard normal to some other normal
* `N(mean, sigma^2^)`, use `RandomRDDs.normalRDD(sc, n, p, seed).map(v => mean + sigma * v)`.
@@ -90,7 +90,7 @@ object RandomRDDs {
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Double] comprised of i.i.d. samples ~ N(0.0, 1.0).
+ * @return RDD[Double] comprised of `i.i.d.` samples ~ N(0.0, 1.0).
*/
def normalRDD(
sc: SparkContext,
@@ -127,14 +127,15 @@ object RandomRDDs {
}
/**
- * Generates an RDD comprised of i.i.d. samples from the Poisson distribution with the input mean.
+ * Generates an RDD comprised of `i.i.d.` samples from the Poisson distribution with the input
+ * mean.
*
* @param sc SparkContext used to create the RDD.
* @param mean Mean, or lambda, for the Poisson distribution.
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Double] comprised of i.i.d. samples ~ Pois(mean).
+ * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
*/
def poissonRDD(
sc: SparkContext,
@@ -177,7 +178,7 @@ object RandomRDDs {
}
/**
- * Generates an RDD comprised of i.i.d. samples from the exponential distribution with
+ * Generates an RDD comprised of `i.i.d.` samples from the exponential distribution with
* the input mean.
*
* @param sc SparkContext used to create the RDD.
@@ -185,7 +186,7 @@ object RandomRDDs {
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Double] comprised of i.i.d. samples ~ Pois(mean).
+ * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
*/
def exponentialRDD(
sc: SparkContext,
@@ -228,7 +229,7 @@ object RandomRDDs {
}
/**
- * Generates an RDD comprised of i.i.d. samples from the gamma distribution with the input
+ * Generates an RDD comprised of `i.i.d.` samples from the gamma distribution with the input
* shape and scale.
*
* @param sc SparkContext used to create the RDD.
@@ -237,7 +238,7 @@ object RandomRDDs {
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Double] comprised of i.i.d. samples ~ Pois(mean).
+ * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
*/
def gammaRDD(
sc: SparkContext,
@@ -287,7 +288,7 @@ object RandomRDDs {
}
/**
- * Generates an RDD comprised of i.i.d. samples from the log normal distribution with the input
+ * Generates an RDD comprised of `i.i.d.` samples from the log normal distribution with the input
* mean and standard deviation
*
* @param sc SparkContext used to create the RDD.
@@ -296,7 +297,7 @@ object RandomRDDs {
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Double] comprised of i.i.d. samples ~ Pois(mean).
+ * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
*/
def logNormalRDD(
sc: SparkContext,
@@ -348,14 +349,14 @@ object RandomRDDs {
/**
* :: DeveloperApi ::
- * Generates an RDD comprised of i.i.d. samples produced by the input RandomDataGenerator.
+ * Generates an RDD comprised of `i.i.d.` samples produced by the input RandomDataGenerator.
*
* @param sc SparkContext used to create the RDD.
* @param generator RandomDataGenerator used to populate the RDD.
* @param size Size of the RDD.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Double] comprised of i.i.d. samples produced by generator.
+ * @return RDD[Double] comprised of `i.i.d.` samples produced by generator.
*/
@DeveloperApi
def randomRDD[T: ClassTag](
@@ -370,7 +371,7 @@ object RandomRDDs {
// TODO Generate RDD[Vector] from multivariate distributions.
/**
- * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+ * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
* uniform distribution on `U(0.0, 1.0)`.
*
* @param sc SparkContext used to create the RDD.
@@ -424,7 +425,7 @@ object RandomRDDs {
}
/**
- * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+ * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
* standard normal distribution.
*
* @param sc SparkContext used to create the RDD.
@@ -432,7 +433,7 @@ object RandomRDDs {
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Vector] with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`.
+ * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ `N(0.0, 1.0)`.
*/
def normalVectorRDD(
sc: SparkContext,
@@ -478,7 +479,7 @@ object RandomRDDs {
}
/**
- * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from a
+ * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from a
* log normal distribution.
*
* @param sc SparkContext used to create the RDD.
@@ -488,7 +489,7 @@ object RandomRDDs {
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Vector] with vectors containing i.i.d. samples.
+ * @return RDD[Vector] with vectors containing `i.i.d.` samples.
*/
def logNormalVectorRDD(
sc: SparkContext,
@@ -544,7 +545,7 @@ object RandomRDDs {
}
/**
- * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+ * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
* Poisson distribution with the input mean.
*
* @param sc SparkContext used to create the RDD.
@@ -553,7 +554,7 @@ object RandomRDDs {
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
* @param seed Random seed (default: a random long integer).
- * @return RDD[Vector] with vectors containing i.i.d. samples ~ Pois(mean).
+ * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Pois(mean).
*/
def poissonVectorRDD(
sc: SparkContext,
@@ -603,7 +604,7 @@ object RandomRDDs {
}
/**
- * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+ * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
* exponential distribution with the input mean.
*
* @param sc SparkContext used to create the RDD.
@@ -612,7 +613,7 @@ object RandomRDDs {
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
* @param seed Random seed (default: a random long integer).
- * @return RDD[Vector] with vectors containing i.i.d. samples ~ Exp(mean).
+ * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean).
*/
def exponentialVectorRDD(
sc: SparkContext,
@@ -665,7 +666,7 @@ object RandomRDDs {
/**
- * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+ * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
* gamma distribution with the input shape and scale.
*
* @param sc SparkContext used to create the RDD.
@@ -675,7 +676,7 @@ object RandomRDDs {
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
* @param seed Random seed (default: a random long integer).
- * @return RDD[Vector] with vectors containing i.i.d. samples ~ Exp(mean).
+ * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean).
*/
def gammaVectorRDD(
sc: SparkContext,
@@ -731,7 +732,7 @@ object RandomRDDs {
/**
* :: DeveloperApi ::
- * Generates an RDD[Vector] with vectors containing i.i.d. samples produced by the
+ * Generates an RDD[Vector] with vectors containing `i.i.d.` samples produced by the
* input RandomDataGenerator.
*
* @param sc SparkContext used to create the RDD.
@@ -740,7 +741,7 @@ object RandomRDDs {
* @param numCols Number of elements in each Vector.
* @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
* @param seed Random seed (default: a random long integer).
- * @return RDD[Vector] with vectors containing i.i.d. samples produced by generator.
+ * @return RDD[Vector] with vectors containing `i.i.d.` samples produced by generator.
*/
@DeveloperApi
def randomVectorRDD(sc: SparkContext,
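A brief sketch of the `i.i.d.` RDD generators documented above (assumes an existing `SparkContext` `sc`; sizes are illustrative):

{{{
import org.apache.spark.mllib.random.RandomRDDs

val u = RandomRDDs.uniformRDD(sc, 1000L, 4)       // 1000 i.i.d. samples ~ U(0.0, 1.0) in 4 partitions
val n = RandomRDDs.normalRDD(sc, 1000L, 4)        // 1000 i.i.d. samples ~ N(0.0, 1.0)
val p = RandomRDDs.poissonRDD(sc, 3.0, 1000L, 4)  // 1000 i.i.d. samples ~ Pois(3.0)

// shift/scale a standard normal into N(mean, sigma^2), as suggested in the docs
val shifted = n.map(v => 10.0 + 2.0 * v)
}}}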
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index f4f51f2ac5..4bb28d1b1e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -25,10 +25,8 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
/**
- * :: Experimental ::
* A more compact class to represent a rating than Tuple3[Int, Int, Double].
*/
-@Experimental
case class Rating(user: Int, product: Int, rating: Double)
/**
@@ -135,10 +133,8 @@ class ALS private (
}
/**
- * :: Experimental ::
* Sets the constant used in computing confidence in implicit ALS. Default: 1.0.
*/
- @Experimental
def setAlpha(alpha: Double): this.type = {
this.alpha = alpha
this
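Since `Rating` and `setAlpha` graduate from `@Experimental` here, a minimal implicit-feedback sketch using both (assumes an existing `SparkContext` `sc`; parameter values are illustrative):

{{{
import org.apache.spark.mllib.recommendation.{ALS, Rating}

val ratings = sc.parallelize(Seq(
  Rating(1, 10, 5.0), Rating(1, 20, 1.0), Rating(2, 10, 4.0)))

val model = new ALS()
  .setRank(10)
  .setIterations(10)
  .setLambda(0.01)
  .setImplicitPrefs(true)
  .setAlpha(1.0)        // confidence constant for implicit feedback
  .run(ratings)

println(model.predict(2, 20))
}}}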
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 5ed6477bae..cb70852e3c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -23,10 +23,13 @@ import java.util.Arrays.binarySearch
import scala.collection.mutable.ArrayBuffer
+import org.apache.spark.annotation.Experimental
import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD}
import org.apache.spark.rdd.RDD
/**
+ * :: Experimental ::
+ *
* Regression model for isotonic regression.
*
* @param boundaries Array of boundaries for which predictions are known.
@@ -35,6 +38,7 @@ import org.apache.spark.rdd.RDD
* Results of isotonic regression and therefore monotone.
* @param isotonic indicates whether this is isotonic or antitonic.
*/
+@Experimental
class IsotonicRegressionModel (
val boundaries: Array[Double],
val predictions: Array[Double],
@@ -123,6 +127,8 @@ class IsotonicRegressionModel (
}
/**
+ * :: Experimental ::
+ *
* Isotonic regression.
* Currently implemented using parallelized pool adjacent violators algorithm.
* Only univariate (single feature) algorithm supported.
@@ -130,14 +136,17 @@ class IsotonicRegressionModel (
* Sequential PAV implementation based on:
* Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani.
* "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61.
- * Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf
+ * Available from [[http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf]]
*
* Sequential PAV parallelization based on:
* Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset.
* "An approach to parallelizing isotonic regression."
* Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147.
- * Available from http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf
+ * Available from [[http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf]]
+ *
+ * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]]
*/
+@Experimental
class IsotonicRegression private (private var isotonic: Boolean) extends Serializable {
/**
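A minimal sketch of fitting the newly `@Experimental` isotonic regression on (label, feature, weight) triples (assumes an existing `SparkContext` `sc`; data is illustrative):

{{{
import org.apache.spark.mllib.regression.IsotonicRegression

// (label, feature, weight) triples
val input = sc.parallelize(Seq(
  (1.0, 1.0, 1.0), (2.0, 2.0, 1.0), (1.5, 3.0, 1.0), (3.0, 4.0, 1.0)))

val model = new IsotonicRegression().setIsotonic(true).run(input)

val predictions = model.predict(sc.parallelize(Seq(1.5, 3.5)))
predictions.collect().foreach(println)
}}}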
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
index 44a8dbb994..c854f12445 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
@@ -39,14 +39,14 @@ import org.apache.spark.streaming.dstream.DStream
*
* For example usage, see `StreamingLinearRegressionWithSGD`.
*
- * NOTE(Freeman): In some use cases, the order in which trainOn and predictOn
+ * NOTE: In some use cases, the order in which trainOn and predictOn
* are called in an application will affect the results. When called on
* the same DStream, if trainOn is called before predictOn, when new data
* arrive the model will update and the prediction will be based on the new
* model. Whereas if predictOn is called first, the prediction will use the model
* from the previous update.
*
- * NOTE(Freeman): It is ok to call predictOn repeatedly on multiple streams; this
+ * NOTE: It is ok to call predictOn repeatedly on multiple streams; this
* will generate predictions for each one all using the current model.
* It is also ok to call trainOn on different streams; this will update
* the model using each of the different sources, in sequence.
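To illustrate the trainOn/predictOn ordering note above, a small sketch with `StreamingLinearRegressionWithSGD` (assumes `trainingStream: DStream[LabeledPoint]` and `testStream: DStream[Vector]` are already defined; the weights dimension is illustrative):

{{{
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD

val model = new StreamingLinearRegressionWithSGD()
  .setInitialWeights(Vectors.zeros(2))

// trainOn before predictOn: predictions on new batches use the freshly updated model
model.trainOn(trainingStream)
model.predictOn(testStream).print()
}}}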
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 3cf4e807b4..b3fad0c52d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -26,36 +26,32 @@ import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult}
import org.apache.spark.rdd.RDD
/**
+ * :: Experimental ::
* API for statistical functions in MLlib.
*/
@Experimental
object Statistics {
/**
- * :: Experimental ::
* Computes column-wise summary statistics for the input RDD[Vector].
*
* @param X an RDD[Vector] for which column-wise summary statistics are to be computed.
* @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics.
*/
- @Experimental
def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = {
new RowMatrix(X).computeColumnSummaryStatistics()
}
/**
- * :: Experimental ::
* Compute the Pearson correlation matrix for the input RDD of Vectors.
* Columns with 0 covariance produce NaN entries in the correlation matrix.
*
* @param X an RDD[Vector] for which the correlation matrix is to be computed.
* @return Pearson correlation matrix comparing columns in X.
*/
- @Experimental
def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X)
/**
- * :: Experimental ::
* Compute the correlation matrix for the input RDD of Vectors using the specified method.
* Methods currently supported: `pearson` (default), `spearman`.
*
@@ -69,11 +65,9 @@ object Statistics {
* Supported: `pearson` (default), `spearman`
* @return Correlation matrix comparing columns in X.
*/
- @Experimental
def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method)
/**
- * :: Experimental ::
* Compute the Pearson correlation for the input RDDs.
* Returns NaN if either vector has 0 variance.
*
@@ -84,11 +78,9 @@ object Statistics {
* @param y RDD[Double] of the same cardinality as x.
* @return A Double containing the Pearson correlation between the two input RDD[Double]s
*/
- @Experimental
def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
/**
- * :: Experimental ::
* Compute the correlation for the input RDDs using the specified method.
* Methods currently supported: `pearson` (default), `spearman`.
*
@@ -99,14 +91,12 @@ object Statistics {
* @param y RDD[Double] of the same cardinality as x.
* @param method String specifying the method to use for computing correlation.
* Supported: `pearson` (default), `spearman`
- *@return A Double containing the correlation between the two input RDD[Double]s using the
+ * @return A Double containing the correlation between the two input RDD[Double]s using the
* specified method.
*/
- @Experimental
def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
/**
- * :: Experimental ::
* Conduct Pearson's chi-squared goodness of fit test of the observed data against the
* expected distribution.
*
@@ -120,13 +110,11 @@ object Statistics {
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
*/
- @Experimental
def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
ChiSqTest.chiSquared(observed, expected)
}
/**
- * :: Experimental ::
* Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform
* distribution, with each category having an expected frequency of `1 / observed.size`.
*
@@ -136,11 +124,9 @@ object Statistics {
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
*/
- @Experimental
def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
/**
- * :: Experimental ::
* Conduct Pearson's independence test on the input contingency matrix, which cannot contain
* negative entries or columns or rows that sum up to 0.
*
@@ -148,11 +134,9 @@ object Statistics {
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
*/
- @Experimental
def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed)
/**
- * :: Experimental ::
* Conduct Pearson's independence test for every feature against the label across the input RDD.
* For each feature, the (feature, label) pairs are converted into a contingency matrix for which
* the chi-squared statistic is computed. All label and feature values must be categorical.
@@ -162,7 +146,6 @@ object Statistics {
* @return an array containing the ChiSquaredTestResult for every feature against the label.
* The order of the elements in the returned array reflects the order of input features.
*/
- @Experimental
def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
ChiSqTest.chiSquaredFeatures(data)
}
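A short sketch of the `Statistics` entry points touched above, now covered by the object-level `@Experimental` tag (assumes an existing `SparkContext` `sc`; data is illustrative):

{{{
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0), Vectors.dense(2.0, 20.0), Vectors.dense(3.0, 30.0)))

val summary = Statistics.colStats(observations)
println(summary.mean)      // column-wise means
println(summary.variance)  // column-wise variances

val x = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))
val y = sc.parallelize(Seq(2.0, 4.0, 6.0, 8.0))
println(Statistics.corr(x, y, "pearson"))  // 1.0 for this toy data
}}}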
diff --git a/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java
index 704d484d0b..3349c50224 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java
@@ -71,8 +71,8 @@ public class JavaMatricesSuite implements Serializable {
Matrix sm = Matrices.diag(sv);
DenseMatrix d = DenseMatrix.diag(v);
DenseMatrix sd = DenseMatrix.diag(sv);
- SparseMatrix s = SparseMatrix.diag(v);
- SparseMatrix ss = SparseMatrix.diag(sv);
+ SparseMatrix s = SparseMatrix.spdiag(v);
+ SparseMatrix ss = SparseMatrix.spdiag(sv);
assertArrayEquals(m.toArray(), sm.toArray(), 0.0);
assertArrayEquals(d.toArray(), sm.toArray(), 0.0);
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
index b1ebfde0e5..c098b5458f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
@@ -137,8 +137,8 @@ class MatricesSuite extends FunSuite {
val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values)
val deMat1 = new DenseMatrix(m, n, allValues)
- val spMat2 = deMat1.toSparse()
- val deMat2 = spMat1.toDense()
+ val spMat2 = deMat1.toSparse
+ val deMat2 = spMat1.toDense
assert(spMat1.toBreeze === spMat2.toBreeze)
assert(deMat1.toBreeze === deMat2.toBreeze)
@@ -185,8 +185,8 @@ class MatricesSuite extends FunSuite {
assert(!dA.toArray.eq(dAT.toArray), "has to have a new array")
assert(dA.values.eq(dAT.transpose.asInstanceOf[DenseMatrix].values), "should not copy array")
- assert(dAT.toSparse().toBreeze === sATexpected.toBreeze)
- assert(sAT.toDense().toBreeze === dATexpected.toBreeze)
+ assert(dAT.toSparse.toBreeze === sATexpected.toBreeze)
+ assert(sAT.toDense.toBreeze === dATexpected.toBreeze)
}
test("foreachActive") {