author    Xiangrui Meng <meng@databricks.com>  2015-08-25 22:33:48 -0700
committer Xiangrui Meng <meng@databricks.com>  2015-08-25 22:33:55 -0700
commit    be0c9915c0084a187933f338e51e606dc68e93af (patch)
tree      48bef486f677c5cba11baa021210df1ba58f070b
parent    b7766699aef65586b0c3af96fb625efaa218d2b2 (diff)
[SPARK-10234] [MLLIB] update since version in mllib.clustering
Same as #8421 but for `mllib.clustering`.

cc feynmanliang yu-iskw

Author: Xiangrui Meng <meng@databricks.com>

Closes #8435 from mengxr/SPARK-10234.

(cherry picked from commit d703372f86d6a59383ba8569fcd9d379849cffbf)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala          |  1
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala     |  8
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala                   |  1
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala              |  4
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala                 | 28
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala | 10
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala          | 15
7 files changed, 44 insertions(+), 23 deletions(-)
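
For readers unfamiliar with the pattern this patch applies throughout: Scala allows annotations on a class, on its primary constructor, and on individual constructor parameters, and the hunks below use all three placements. The following sketch is illustrative only; the Since definition is a stand-in so the example compiles on its own, while Spark's real annotation lives in org.apache.spark.annotation.

import scala.annotation.StaticAnnotation

// Stand-in for org.apache.spark.annotation.Since (assumption: illustrative only).
class Since(version: String) extends StaticAnnotation

@Since("1.3.0")                                       // the class itself, since 1.3.0
class ExampleModel @Since("1.3.0") (                  // the primary constructor
    @Since("1.3.0") val weights: Array[Double])       // each public parameter/field
  extends Serializable

Annotating the constructor and each val separately lets a field keep an earlier version than the constructor that now exposes it, as in the KMeansModel hunk below where clusterCenters is tagged 1.0.0 but the constructor 1.1.0.
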
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index daa947e81d..f82bd82c20 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -53,6 +53,7 @@ import org.apache.spark.util.Utils
* @param maxIterations The maximum number of iterations to perform
*/
@Experimental
+@Since("1.3.0")
class GaussianMixture private (
private var k: Int,
private var convergenceTol: Double,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 1a10a8b624..7f6163e04b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -46,9 +46,9 @@ import org.apache.spark.sql.{SQLContext, Row}
*/
@Since("1.3.0")
@Experimental
-class GaussianMixtureModel(
- val weights: Array[Double],
- val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {
+class GaussianMixtureModel @Since("1.3.0") (
+ @Since("1.3.0") val weights: Array[Double],
+ @Since("1.3.0") val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {
require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")
@@ -178,7 +178,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
(weight, new MultivariateGaussian(mu, sigma))
}.unzip
- return new GaussianMixtureModel(weights.toArray, gaussians.toArray)
+ new GaussianMixtureModel(weights.toArray, gaussians.toArray)
}
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 3e9545a74b..46920fffe6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom
* This is an iterative algorithm that will make multiple passes over the data, so any RDDs given
* to it should be cached by the user.
*/
+@Since("0.8.0")
class KMeans private (
private var k: Int,
private var maxIterations: Int,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index 13fc4a81ff..45021f4375 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -37,8 +37,8 @@ import org.apache.spark.sql.Row
* A clustering model for K-means. Each point belongs to the cluster with the closest center.
*/
@Since("0.8.0")
-class KMeansModel (
- val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable {
+class KMeansModel @Since("1.1.0") (@Since("1.0.0") val clusterCenters: Array[Vector])
+ extends Saveable with Serializable with PMMLExportable {
/**
* A Java-friendly constructor that takes an Iterable of Vectors.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 432bbedc8d..15129e0dd5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -43,12 +43,15 @@ import org.apache.spark.util.BoundedPriorityQueue
* including local and distributed data structures.
*/
@Experimental
+@Since("1.3.0")
abstract class LDAModel private[clustering] extends Saveable {
/** Number of topics */
+ @Since("1.3.0")
def k: Int
/** Vocabulary size (number of terms or terms in the vocabulary) */
+ @Since("1.3.0")
def vocabSize: Int
/**
@@ -57,6 +60,7 @@ abstract class LDAModel private[clustering] extends Saveable {
*
* This is the parameter to a Dirichlet distribution.
*/
+ @Since("1.5.0")
def docConcentration: Vector
/**
@@ -68,6 +72,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* Note: The topics' distributions over terms are called "beta" in the original LDA paper
* by Blei et al., but are called "phi" in many later papers such as Asuncion et al., 2009.
*/
+ @Since("1.5.0")
def topicConcentration: Double
/**
@@ -81,6 +86,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* This is a matrix of size vocabSize x k, where each column is a topic.
* No guarantees are given about the ordering of the topics.
*/
+ @Since("1.3.0")
def topicsMatrix: Matrix
/**
@@ -91,6 +97,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* (term indices, term weights in topic).
* Each topic's terms are sorted in order of decreasing weight.
*/
+ @Since("1.3.0")
def describeTopics(maxTermsPerTopic: Int): Array[(Array[Int], Array[Double])]
/**
@@ -102,6 +109,7 @@ abstract class LDAModel private[clustering] extends Saveable {
* (term indices, term weights in topic).
* Each topic's terms are sorted in order of decreasing weight.
*/
+ @Since("1.3.0")
def describeTopics(): Array[(Array[Int], Array[Double])] = describeTopics(vocabSize)
/* TODO (once LDA can be trained with Strings or given a dictionary)
@@ -185,10 +193,11 @@ abstract class LDAModel private[clustering] extends Saveable {
* @param topics Inferred topics (vocabSize x k matrix).
*/
@Experimental
+@Since("1.3.0")
class LocalLDAModel private[clustering] (
- val topics: Matrix,
- override val docConcentration: Vector,
- override val topicConcentration: Double,
+ @Since("1.3.0") val topics: Matrix,
+ @Since("1.5.0") override val docConcentration: Vector,
+ @Since("1.5.0") override val topicConcentration: Double,
override protected[clustering] val gammaShape: Double = 100)
extends LDAModel with Serializable {
@@ -376,6 +385,7 @@ class LocalLDAModel private[clustering] (
}
@Experimental
+@Since("1.5.0")
object LocalLDAModel extends Loader[LocalLDAModel] {
private object SaveLoadV1_0 {
@@ -479,13 +489,14 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
* than the [[LocalLDAModel]].
*/
@Experimental
+@Since("1.3.0")
class DistributedLDAModel private[clustering] (
private[clustering] val graph: Graph[LDA.TopicCounts, LDA.TokenCount],
private[clustering] val globalTopicTotals: LDA.TopicCounts,
- val k: Int,
- val vocabSize: Int,
- override val docConcentration: Vector,
- override val topicConcentration: Double,
+ @Since("1.3.0") val k: Int,
+ @Since("1.3.0") val vocabSize: Int,
+ @Since("1.5.0") override val docConcentration: Vector,
+ @Since("1.5.0") override val topicConcentration: Double,
private[spark] val iterationTimes: Array[Double],
override protected[clustering] val gammaShape: Double = 100)
extends LDAModel {
@@ -603,6 +614,7 @@ class DistributedLDAModel private[clustering] (
* (term indices, topic indices). Note that terms will be omitted if not present in
* the document.
*/
+ @Since("1.5.0")
lazy val topicAssignments: RDD[(Long, Array[Int], Array[Int])] = {
// For reference, compare the below code with the core part of EMLDAOptimizer.next().
val eta = topicConcentration
@@ -634,6 +646,7 @@ class DistributedLDAModel private[clustering] (
}
/** Java-friendly version of [[topicAssignments]] */
+ @Since("1.5.0")
lazy val javaTopicAssignments: JavaRDD[(java.lang.Long, Array[Int], Array[Int])] = {
topicAssignments.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Int])]].toJavaRDD()
}
@@ -770,6 +783,7 @@ class DistributedLDAModel private[clustering] (
@Experimental
+@Since("1.5.0")
object DistributedLDAModel extends Loader[DistributedLDAModel] {
private object SaveLoadV1_0 {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index 396b36f2f6..da234bdbb2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -42,9 +42,10 @@ import org.apache.spark.{Logging, SparkContext, SparkException}
*/
@Since("1.3.0")
@Experimental
-class PowerIterationClusteringModel(
- val k: Int,
- val assignments: RDD[PowerIterationClustering.Assignment]) extends Saveable with Serializable {
+class PowerIterationClusteringModel @Since("1.3.0") (
+ @Since("1.3.0") val k: Int,
+ @Since("1.3.0") val assignments: RDD[PowerIterationClustering.Assignment])
+ extends Saveable with Serializable {
@Since("1.4.0")
override def save(sc: SparkContext, path: String): Unit = {
@@ -56,6 +57,8 @@ class PowerIterationClusteringModel(
@Since("1.4.0")
object PowerIterationClusteringModel extends Loader[PowerIterationClusteringModel] {
+
+ @Since("1.4.0")
override def load(sc: SparkContext, path: String): PowerIterationClusteringModel = {
PowerIterationClusteringModel.SaveLoadV1_0.load(sc, path)
}
@@ -120,6 +123,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringModel]
* @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]]
*/
@Experimental
+@Since("1.3.0")
class PowerIterationClustering private[clustering] (
private var k: Int,
private var maxIterations: Int,
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index 41f2668ec6..1d50ffec96 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -66,9 +66,10 @@ import org.apache.spark.util.random.XORShiftRandom
*/
@Since("1.2.0")
@Experimental
-class StreamingKMeansModel(
- override val clusterCenters: Array[Vector],
- val clusterWeights: Array[Double]) extends KMeansModel(clusterCenters) with Logging {
+class StreamingKMeansModel @Since("1.2.0") (
+ @Since("1.2.0") override val clusterCenters: Array[Vector],
+ @Since("1.2.0") val clusterWeights: Array[Double])
+ extends KMeansModel(clusterCenters) with Logging {
/**
* Perform a k-means update on a batch of data.
@@ -168,10 +169,10 @@ class StreamingKMeansModel(
*/
@Since("1.2.0")
@Experimental
-class StreamingKMeans(
- var k: Int,
- var decayFactor: Double,
- var timeUnit: String) extends Logging with Serializable {
+class StreamingKMeans @Since("1.2.0") (
+ @Since("1.2.0") var k: Int,
+ @Since("1.2.0") var decayFactor: Double,
+ @Since("1.2.0") var timeUnit: String) extends Logging with Serializable {
@Since("1.2.0")
def this() = this(2, 1.0, StreamingKMeans.BATCHES)
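
One placement in the hunks above that is easy to miss: auxiliary constructors carry their own @Since directly on the def this line, as StreamingKMeans does for its no-argument constructor. A minimal compilable sketch, again with a stand-in Since rather than Spark's real annotation:

import scala.annotation.StaticAnnotation

// Stand-in annotation (assumption: illustrative only).
class Since(version: String) extends StaticAnnotation

@Since("1.2.0")
class ExampleStreaming @Since("1.2.0") (
    @Since("1.2.0") var k: Int,
    @Since("1.2.0") var decayFactor: Double) extends Serializable {

  // The auxiliary constructor is annotated directly, mirroring
  // StreamingKMeans's `def this() = this(2, 1.0, StreamingKMeans.BATCHES)`.
  @Since("1.2.0")
  def this() = this(2, 1.0)
}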