author     MechCoder <manojkumarsivaraj334@gmail.com>  2015-08-20 14:56:08 -0700
committer  Xiangrui Meng <meng@databricks.com>         2015-08-20 14:59:55 -0700
commit     560ec1268b824acc01d347a3fbc78ac16216a9b0 (patch)
tree       04d31d805b06bfe329a58c72ddac2c02d729762b
parent     2e0d2a9cc3cb7021e3bdd032d079cf6c8916c725 (diff)
[SPARK-10108] Add since tags to mllib.feature
Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #8309 from MechCoder/tags_feature.

(cherry picked from commit 7cfc0750e14f2c1b3847e4720cc02150253525a9)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala       12
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala   4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala           11
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala                  8
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala           5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala                  9
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala      13
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala    6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala            19
9 files changed, 76 insertions, 11 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 5f8c1dea23..fdd974d7a3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -19,7 +19,7 @@ package org.apache.spark.mllib.feature
import scala.collection.mutable.ArrayBuilder
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
@@ -31,8 +31,10 @@ import org.apache.spark.rdd.RDD
*
* @param selectedFeatures list of indices to select (filter). Must be ordered asc
*/
+@Since("1.3.0")
@Experimental
-class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransformer {
+class ChiSqSelectorModel (
+ @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer {
require(isSorted(selectedFeatures), "Array has to be sorted asc")
@@ -52,6 +54,7 @@ class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransf
* @param vector vector to be transformed.
* @return transformed vector.
*/
+ @Since("1.3.0")
override def transform(vector: Vector): Vector = {
compress(vector, selectedFeatures)
}
@@ -107,8 +110,10 @@ class ChiSqSelectorModel (val selectedFeatures: Array[Int]) extends VectorTransf
* @param numTopFeatures number of features that selector will select
* (ordered by statistic value descending)
*/
+@Since("1.3.0")
@Experimental
-class ChiSqSelector (val numTopFeatures: Int) extends Serializable {
+class ChiSqSelector (
+ @Since("1.3.0") val numTopFeatures: Int) extends Serializable {
/**
* Returns a ChiSquared feature selector.
@@ -117,6 +122,7 @@ class ChiSqSelector (val numTopFeatures: Int) extends Serializable {
* Real-valued features will be treated as categorical for each distinct value.
* Apply feature discretizer before using this function.
*/
+ @Since("1.3.0")
def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
val indices = Statistics.chiSqTest(data)
.zipWithIndex.sortBy { case (res, _) => -res.statistic }
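Editor's note: the hunks above only add @Since tags; the ChiSqSelector API itself is unchanged. A minimal usage sketch of the annotated methods, assuming an existing SparkContext sc and categorical feature values (as the scaladoc requires):

  import org.apache.spark.mllib.feature.ChiSqSelector
  import org.apache.spark.mllib.linalg.Vectors
  import org.apache.spark.mllib.regression.LabeledPoint

  // Toy labeled data with discrete (categorical) feature values.
  val labeled = sc.parallelize(Seq(
    LabeledPoint(0.0, Vectors.dense(0.0, 1.0, 2.0)),
    LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 0.0))))

  // Keep the 2 features with the highest chi-squared statistics.
  val selector = new ChiSqSelector(2).fit(labeled)
  val reduced = labeled.map(p => selector.transform(p.features))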
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
index d67fe6c3ee..33e2d17bb4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.feature
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg._
/**
@@ -27,6 +27,7 @@ import org.apache.spark.mllib.linalg._
* multiplier.
* @param scalingVec The values used to scale the reference vector's individual components.
*/
+@Since("1.4.0")
@Experimental
class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer {
@@ -36,6 +37,7 @@ class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer {
* @param vector vector to be transformed.
* @return transformed vector.
*/
+ @Since("1.4.0")
override def transform(vector: Vector): Vector = {
require(vector.size == scalingVec.size,
s"vector sizes do not match: Expected ${scalingVec.size} but found ${vector.size}")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
index c534758183..e47d524b61 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -22,7 +22,7 @@ import java.lang.{Iterable => JavaIterable}
import scala.collection.JavaConverters._
import scala.collection.mutable
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
@@ -34,19 +34,25 @@ import org.apache.spark.util.Utils
*
* @param numFeatures number of features (default: 2^20^)
*/
+@Since("1.1.0")
@Experimental
class HashingTF(val numFeatures: Int) extends Serializable {
+ /**
+ */
+ @Since("1.1.0")
def this() = this(1 << 20)
/**
* Returns the index of the input term.
*/
+ @Since("1.1.0")
def indexOf(term: Any): Int = Utils.nonNegativeMod(term.##, numFeatures)
/**
* Transforms the input document into a sparse term frequency vector.
*/
+ @Since("1.1.0")
def transform(document: Iterable[_]): Vector = {
val termFrequencies = mutable.HashMap.empty[Int, Double]
document.foreach { term =>
@@ -59,6 +65,7 @@ class HashingTF(val numFeatures: Int) extends Serializable {
/**
* Transforms the input document into a sparse term frequency vector (Java version).
*/
+ @Since("1.1.0")
def transform(document: JavaIterable[_]): Vector = {
transform(document.asScala)
}
@@ -66,6 +73,7 @@ class HashingTF(val numFeatures: Int) extends Serializable {
/**
* Transforms the input document to term frequency vectors.
*/
+ @Since("1.1.0")
def transform[D <: Iterable[_]](dataset: RDD[D]): RDD[Vector] = {
dataset.map(this.transform)
}
@@ -73,6 +81,7 @@ class HashingTF(val numFeatures: Int) extends Serializable {
/**
* Transforms the input document to term frequency vectors (Java version).
*/
+ @Since("1.1.0")
def transform[D <: JavaIterable[_]](dataset: JavaRDD[D]): JavaRDD[Vector] = {
dataset.rdd.map(this.transform).toJavaRDD()
}
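Editor's note: a usage sketch for the HashingTF methods tagged above, assuming an existing SparkContext sc; the no-arg constructor keeps the default 2^20 features:

  import org.apache.spark.mllib.feature.HashingTF
  import org.apache.spark.mllib.linalg.Vector
  import org.apache.spark.rdd.RDD

  val docs: RDD[Seq[String]] = sc.parallelize(Seq(
    Seq("spark", "mllib", "feature"),
    Seq("spark", "since", "tags")))

  val tf = new HashingTF()                     // default: 1 << 20 features
  val tfVectors: RDD[Vector] = tf.transform(docs)
  val idx = tf.indexOf("spark")                // hash-based index of a single term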
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index 3fab7ea79b..d5353ddd97 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -19,7 +19,7 @@ package org.apache.spark.mllib.feature
import breeze.linalg.{DenseVector => BDV}
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.rdd.RDD
@@ -37,6 +37,7 @@ import org.apache.spark.rdd.RDD
* @param minDocFreq minimum of documents in which a term
* should appear for filtering
*/
+@Since("1.1.0")
@Experimental
class IDF(val minDocFreq: Int) {
@@ -48,6 +49,7 @@ class IDF(val minDocFreq: Int) {
* Computes the inverse document frequency.
* @param dataset an RDD of term frequency vectors
*/
+ @Since("1.1.0")
def fit(dataset: RDD[Vector]): IDFModel = {
val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
minDocFreq = minDocFreq))(
@@ -61,6 +63,7 @@ class IDF(val minDocFreq: Int) {
* Computes the inverse document frequency.
* @param dataset a JavaRDD of term frequency vectors
*/
+ @Since("1.1.0")
def fit(dataset: JavaRDD[Vector]): IDFModel = {
fit(dataset.rdd)
}
@@ -171,6 +174,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable {
* @param dataset an RDD of term frequency vectors
* @return an RDD of TF-IDF vectors
*/
+ @Since("1.1.0")
def transform(dataset: RDD[Vector]): RDD[Vector] = {
val bcIdf = dataset.context.broadcast(idf)
dataset.mapPartitions(iter => iter.map(v => IDFModel.transform(bcIdf.value, v)))
@@ -182,6 +186,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable {
* @param v a term frequency vector
* @return a TF-IDF vector
*/
+ @Since("1.3.0")
def transform(v: Vector): Vector = IDFModel.transform(idf, v)
/**
@@ -189,6 +194,7 @@ class IDFModel private[spark] (val idf: Vector) extends Serializable {
* @param dataset a JavaRDD of term frequency vectors
* @return a JavaRDD of TF-IDF vectors
*/
+ @Since("1.1.0")
def transform(dataset: JavaRDD[Vector]): JavaRDD[Vector] = {
transform(dataset.rdd).toJavaRDD()
}
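Editor's note: the IDF methods tagged above are typically chained after HashingTF. An illustrative continuation of the previous sketch, assuming the tfVectors RDD defined there:

  import org.apache.spark.mllib.feature.IDF

  // Fit document frequencies, then rescale the term-frequency vectors.
  val idfModel = new IDF(minDocFreq = 1).fit(tfVectors)
  val tfidf = idfModel.transform(tfVectors)            // RDD[Vector] of TF-IDF values
  val single = idfModel.transform(tfVectors.first())   // per-vector overload (since 1.3.0)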
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala
index 32848e039e..0e070257d9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.feature
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
/**
@@ -31,9 +31,11 @@ import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors
*
* @param p Normalization in L^p^ space, p = 2 by default.
*/
+@Since("1.1.0")
@Experimental
class Normalizer(p: Double) extends VectorTransformer {
+ @Since("1.1.0")
def this() = this(2)
require(p >= 1.0)
@@ -44,6 +46,7 @@ class Normalizer(p: Double) extends VectorTransformer {
* @param vector vector to be normalized.
* @return normalized vector. If the norm of the input is zero, it will return the input vector.
*/
+ @Since("1.1.0")
override def transform(vector: Vector): Vector = {
val norm = Vectors.norm(vector, p)
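Editor's note: a minimal sketch of the Normalizer tagged above; the default is p = 2, i.e. unit L2 norm:

  import org.apache.spark.mllib.feature.Normalizer
  import org.apache.spark.mllib.linalg.Vectors

  val l2 = new Normalizer()            // p = 2 by default
  val l1 = new Normalizer(p = 1.0)
  val unit = l2.transform(Vectors.dense(3.0, 4.0))   // (0.6, 0.8)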
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
index 2a66263d8b..a48b7bba66 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
@@ -17,6 +17,7 @@
package org.apache.spark.mllib.feature
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed.RowMatrix
@@ -27,6 +28,7 @@ import org.apache.spark.rdd.RDD
*
* @param k number of principal components
*/
+@Since("1.4.0")
class PCA(val k: Int) {
require(k >= 1, s"PCA requires a number of principal components k >= 1 but was given $k")
@@ -35,6 +37,7 @@ class PCA(val k: Int) {
*
* @param sources source vectors
*/
+ @Since("1.4.0")
def fit(sources: RDD[Vector]): PCAModel = {
require(k <= sources.first().size,
s"source vector size is ${sources.first().size} must be greater than k=$k")
@@ -58,7 +61,10 @@ class PCA(val k: Int) {
new PCAModel(k, pc)
}
- /** Java-friendly version of [[fit()]] */
+ /**
+ * Java-friendly version of [[fit()]]
+ */
+ @Since("1.4.0")
def fit(sources: JavaRDD[Vector]): PCAModel = fit(sources.rdd)
}
@@ -76,6 +82,7 @@ class PCAModel private[spark] (val k: Int, val pc: DenseMatrix) extends VectorTr
* Vector must be the same length as the source vectors given to [[PCA.fit()]].
* @return transformed vector. Vector will be of length k.
*/
+ @Since("1.4.0")
override def transform(vector: Vector): Vector = {
vector match {
case dv: DenseVector =>
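Editor's note: a usage sketch for the PCA API tagged above, assuming an existing SparkContext sc; k must not exceed the source vector size, per the require in fit:

  import org.apache.spark.mllib.feature.PCA
  import org.apache.spark.mllib.linalg.Vectors

  val data = sc.parallelize(Seq(
    Vectors.dense(1.0, 0.0, 7.0),
    Vectors.dense(2.0, 0.0, 3.0),
    Vectors.dense(4.0, 0.0, 1.0)))

  val pcaModel = new PCA(k = 2).fit(data)          // project onto 2 principal components
  val projected = data.map(pcaModel.transform)     // vectors of length k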
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
index c73b8f2580..b95d5a8990 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
@@ -18,7 +18,7 @@
package org.apache.spark.mllib.feature
import org.apache.spark.Logging
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.rdd.RDD
@@ -32,9 +32,11 @@ import org.apache.spark.rdd.RDD
* dense output, so this does not work on sparse input and will raise an exception.
* @param withStd True by default. Scales the data to unit standard deviation.
*/
+@Since("1.1.0")
@Experimental
class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging {
+ @Since("1.1.0")
def this() = this(false, true)
if (!(withMean || withStd)) {
@@ -47,6 +49,7 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging {
* @param data The data used to compute the mean and variance to build the transformation model.
* @return a StandardScalarModel
*/
+ @Since("1.1.0")
def fit(data: RDD[Vector]): StandardScalerModel = {
// TODO: skip computation if both withMean and withStd are false
val summary = data.treeAggregate(new MultivariateOnlineSummarizer)(
@@ -69,6 +72,7 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging {
* @param withStd whether to scale the data to have unit standard deviation
* @param withMean whether to center the data before scaling
*/
+@Since("1.1.0")
@Experimental
class StandardScalerModel (
val std: Vector,
@@ -76,6 +80,9 @@ class StandardScalerModel (
var withStd: Boolean,
var withMean: Boolean) extends VectorTransformer {
+ /**
+ */
+ @Since("1.3.0")
def this(std: Vector, mean: Vector) {
this(std, mean, withStd = std != null, withMean = mean != null)
require(this.withStd || this.withMean,
@@ -86,8 +93,10 @@ class StandardScalerModel (
}
}
+ @Since("1.3.0")
def this(std: Vector) = this(std, null)
+ @Since("1.3.0")
@DeveloperApi
def setWithMean(withMean: Boolean): this.type = {
require(!(withMean && this.mean == null), "cannot set withMean to true while mean is null")
@@ -95,6 +104,7 @@ class StandardScalerModel (
this
}
+ @Since("1.3.0")
@DeveloperApi
def setWithStd(withStd: Boolean): this.type = {
require(!(withStd && this.std == null),
@@ -115,6 +125,7 @@ class StandardScalerModel (
* @return Standardized vector. If the std of a column is zero, it will return default `0.0`
* for the column with zero std.
*/
+ @Since("1.1.0")
override def transform(vector: Vector): Vector = {
require(mean.size == vector.size)
if (withMean) {
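Editor's note: a sketch of the StandardScaler flow tagged above, assuming an existing SparkContext sc; withMean = true produces dense output, so it should not be used on sparse input:

  import org.apache.spark.mllib.feature.StandardScaler
  import org.apache.spark.mllib.linalg.Vectors

  val data = sc.parallelize(Seq(
    Vectors.dense(1.0, 10.0),
    Vectors.dense(2.0, 20.0),
    Vectors.dense(3.0, 30.0)))

  // Center each column and scale to unit standard deviation.
  val scaler = new StandardScaler(withMean = true, withStd = true).fit(data)
  val scaled = data.map(scaler.transform)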
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala
index 7358c1c84f..5778fd1d09 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala
@@ -17,7 +17,7 @@
package org.apache.spark.mllib.feature
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
@@ -26,6 +26,7 @@ import org.apache.spark.rdd.RDD
* :: DeveloperApi ::
* Trait for transformation of a vector
*/
+@Since("1.1.0")
@DeveloperApi
trait VectorTransformer extends Serializable {
@@ -35,6 +36,7 @@ trait VectorTransformer extends Serializable {
* @param vector vector to be transformed.
* @return transformed vector.
*/
+ @Since("1.1.0")
def transform(vector: Vector): Vector
/**
@@ -43,6 +45,7 @@ trait VectorTransformer extends Serializable {
* @param data RDD[Vector] to be transformed.
* @return transformed RDD[Vector].
*/
+ @Since("1.1.0")
def transform(data: RDD[Vector]): RDD[Vector] = {
// Later in #1498 , all RDD objects are sent via broadcasting instead of akka.
// So it should be no longer necessary to explicitly broadcast `this` object.
@@ -55,6 +58,7 @@ trait VectorTransformer extends Serializable {
* @param data JavaRDD[Vector] to be transformed.
* @return transformed JavaRDD[Vector].
*/
+ @Since("1.1.0")
def transform(data: JavaRDD[Vector]): JavaRDD[Vector] = {
transform(data.rdd)
}
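Editor's note: because VectorTransformer only requires the single-vector transform, an implementation inherits the RDD and JavaRDD overloads shown above for free. A minimal sketch with a hypothetical Doubler class, for illustration only:

  import org.apache.spark.mllib.feature.VectorTransformer
  import org.apache.spark.mllib.linalg.{Vector, Vectors}

  // Hypothetical transformer: doubles every component of the input.
  class Doubler extends VectorTransformer {
    override def transform(vector: Vector): Vector =
      Vectors.dense(vector.toArray.map(_ * 2.0))
  }

  val doubled = new Doubler().transform(Vectors.dense(1.0, 2.0))   // (2.0, 4.0)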
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index cbbd2b0c8d..e6f45ae4b0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -32,7 +32,7 @@ import org.json4s.jackson.JsonMethods._
import org.apache.spark.Logging
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors, DenseMatrix, BLAS, DenseVector}
import org.apache.spark.mllib.util.{Loader, Saveable}
@@ -70,6 +70,7 @@ private case class VocabWord(
* and
* Distributed Representations of Words and Phrases and their Compositionality.
*/
+@Since("1.1.0")
@Experimental
class Word2Vec extends Serializable with Logging {
@@ -83,6 +84,7 @@ class Word2Vec extends Serializable with Logging {
/**
* Sets vector size (default: 100).
*/
+ @Since("1.1.0")
def setVectorSize(vectorSize: Int): this.type = {
this.vectorSize = vectorSize
this
@@ -91,6 +93,7 @@ class Word2Vec extends Serializable with Logging {
/**
* Sets initial learning rate (default: 0.025).
*/
+ @Since("1.1.0")
def setLearningRate(learningRate: Double): this.type = {
this.learningRate = learningRate
this
@@ -99,6 +102,7 @@ class Word2Vec extends Serializable with Logging {
/**
* Sets number of partitions (default: 1). Use a small number for accuracy.
*/
+ @Since("1.1.0")
def setNumPartitions(numPartitions: Int): this.type = {
require(numPartitions > 0, s"numPartitions must be greater than 0 but got $numPartitions")
this.numPartitions = numPartitions
@@ -109,6 +113,7 @@ class Word2Vec extends Serializable with Logging {
* Sets number of iterations (default: 1), which should be smaller than or equal to number of
* partitions.
*/
+ @Since("1.1.0")
def setNumIterations(numIterations: Int): this.type = {
this.numIterations = numIterations
this
@@ -117,6 +122,7 @@ class Word2Vec extends Serializable with Logging {
/**
* Sets random seed (default: a random long integer).
*/
+ @Since("1.1.0")
def setSeed(seed: Long): this.type = {
this.seed = seed
this
@@ -126,6 +132,7 @@ class Word2Vec extends Serializable with Logging {
* Sets minCount, the minimum number of times a token must appear to be included in the word2vec
* model's vocabulary (default: 5).
*/
+ @Since("1.3.0")
def setMinCount(minCount: Int): this.type = {
this.minCount = minCount
this
@@ -263,6 +270,7 @@ class Word2Vec extends Serializable with Logging {
* @param dataset an RDD of words
* @return a Word2VecModel
*/
+ @Since("1.1.0")
def fit[S <: Iterable[String]](dataset: RDD[S]): Word2VecModel = {
val words = dataset.flatMap(x => x)
@@ -412,6 +420,7 @@ class Word2Vec extends Serializable with Logging {
* @param dataset a JavaRDD of words
* @return a Word2VecModel
*/
+ @Since("1.1.0")
def fit[S <: JavaIterable[String]](dataset: JavaRDD[S]): Word2VecModel = {
fit(dataset.rdd.map(_.asScala))
}
@@ -454,6 +463,7 @@ class Word2VecModel private[mllib] (
wordVecNorms
}
+ @Since("1.5.0")
def this(model: Map[String, Array[Float]]) = {
this(Word2VecModel.buildWordIndex(model), Word2VecModel.buildWordVectors(model))
}
@@ -469,6 +479,7 @@ class Word2VecModel private[mllib] (
override protected def formatVersion = "1.0"
+ @Since("1.4.0")
def save(sc: SparkContext, path: String): Unit = {
Word2VecModel.SaveLoadV1_0.save(sc, path, getVectors)
}
@@ -478,6 +489,7 @@ class Word2VecModel private[mllib] (
* @param word a word
* @return vector representation of word
*/
+ @Since("1.1.0")
def transform(word: String): Vector = {
wordIndex.get(word) match {
case Some(ind) =>
@@ -494,6 +506,7 @@ class Word2VecModel private[mllib] (
* @param num number of synonyms to find
* @return array of (word, cosineSimilarity)
*/
+ @Since("1.1.0")
def findSynonyms(word: String, num: Int): Array[(String, Double)] = {
val vector = transform(word)
findSynonyms(vector, num)
@@ -505,6 +518,7 @@ class Word2VecModel private[mllib] (
* @param num number of synonyms to find
* @return array of (word, cosineSimilarity)
*/
+ @Since("1.1.0")
def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = {
require(num > 0, "Number of similar words should > 0")
// TODO: optimize top-k
@@ -534,6 +548,7 @@ class Word2VecModel private[mllib] (
/**
* Returns a map of words to their vector representations.
*/
+ @Since("1.2.0")
def getVectors: Map[String, Array[Float]] = {
wordIndex.map { case (word, ind) =>
(word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize))
@@ -541,6 +556,7 @@ class Word2VecModel private[mllib] (
}
}
+@Since("1.4.0")
@Experimental
object Word2VecModel extends Loader[Word2VecModel] {
@@ -600,6 +616,7 @@ object Word2VecModel extends Loader[Word2VecModel] {
}
}
+ @Since("1.4.0")
override def load(sc: SparkContext, path: String): Word2VecModel = {
val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path)
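Editor's note: a usage sketch covering the Word2Vec and Word2VecModel members tagged above, assuming an existing SparkContext sc; the tiny corpus and the save path are illustrative only:

  import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

  val corpus = sc.parallelize(Seq(
    "spark mllib adds since tags".split(" ").toSeq,
    "since tags document the api".split(" ").toSeq))

  val model = new Word2Vec()
    .setVectorSize(10)
    .setMinCount(1)
    .setSeed(42L)
    .fit(corpus)

  val vec = model.transform("spark")               // vector for a single word
  val synonyms = model.findSynonyms("spark", 2)    // Array[(String, Double)]
  model.save(sc, "/tmp/word2vec-model")            // reload with Word2VecModel.load(sc, path)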