diff options
author | Davies Liu <davies@databricks.com> | 2014-10-28 03:50:22 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2014-10-28 03:50:22 -0700 |
commit | fae095bc7c4097859af522ced77f09cf6be17691 (patch) | |
tree | 8b4be3f716e8312c372105b4cc5132860bed2412 /mllib | |
parent | 46c63417c1bb1aea07baf9036cc5b8f1c3781bbe (diff) | |
download | spark-fae095bc7c4097859af522ced77f09cf6be17691.tar.gz spark-fae095bc7c4097859af522ced77f09cf6be17691.tar.bz2 spark-fae095bc7c4097859af522ced77f09cf6be17691.zip |
[SPARK-3961] [MLlib] [PySpark] Python API for mllib.feature
Added completed Python API for MLlib.feature
Normalizer
StandardScalerModel
StandardScaler
HashTF
IDFModel
IDF
cc mengxr
Author: Davies Liu <davies@databricks.com>
Author: Davies Liu <davies.liu@gmail.com>
Closes #2819 from davies/feature and squashes the following commits:
4f48f48 [Davies Liu] add a note for HashingTF
67f6d21 [Davies Liu] address comments
b628693 [Davies Liu] rollback changes in Word2Vec
efb4f4f [Davies Liu] Merge branch 'master' into feature
806c7c2 [Davies Liu] address comments
3abb8c2 [Davies Liu] address comments
59781b9 [Davies Liu] Merge branch 'master' of github.com:apache/spark into feature
a405ae7 [Davies Liu] fix tests
7a1891a [Davies Liu] fix tests
486795f [Davies Liu] update programming guide, HashTF -> HashingTF
8a50584 [Davies Liu] Python API for mllib.feature
Diffstat (limited to 'mllib')
3 files changed, 60 insertions, 4 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index b478c21537..485abe2723 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -31,8 +31,7 @@ import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.api.python.{PythonRDD, SerDeUtil} import org.apache.spark.mllib.classification._ import org.apache.spark.mllib.clustering._ -import org.apache.spark.mllib.feature.Word2Vec -import org.apache.spark.mllib.feature.Word2VecModel +import org.apache.spark.mllib.feature._ import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.random.{RandomRDDs => RG} @@ -292,6 +291,43 @@ class PythonMLLibAPI extends Serializable { } /** + * Java stub for Normalizer.transform() + */ + def normalizeVector(p: Double, vector: Vector): Vector = { + new Normalizer(p).transform(vector) + } + + /** + * Java stub for Normalizer.transform() + */ + def normalizeVector(p: Double, rdd: JavaRDD[Vector]): JavaRDD[Vector] = { + new Normalizer(p).transform(rdd) + } + + /** + * Java stub for IDF.fit(). This stub returns a + * handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on + * exit; see the Py4J documentation. + */ + def fitStandardScaler( + withMean: Boolean, + withStd: Boolean, + data: JavaRDD[Vector]): StandardScalerModel = { + new StandardScaler(withMean, withStd).fit(data.rdd) + } + + /** + * Java stub for IDF.fit(). This stub returns a + * handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on + * exit; see the Py4J documentation. + */ + def fitIDF(minDocFreq: Int, dataset: JavaRDD[Vector]): IDFModel = { + new IDF(minDocFreq).fit(dataset) + } + + /** * Java stub for Python mllib Word2Vec fit(). This stub returns a * handle to the Java object instead of the content of the Java object. * Extra care needs to be taken in the Python code to ensure it gets freed on @@ -328,6 +364,15 @@ class PythonMLLibAPI extends Serializable { model.transform(word) } + /** + * Transforms an RDD of words to its vector representation + * @param rdd an RDD of words + * @return an RDD of vector representations of words + */ + def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = { + rdd.rdd.map(model.transform) + } + def findSynonyms(word: String, num: Int): java.util.List[java.lang.Object] = { val vec = transform(word) findSynonyms(vec, num) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala index 415a845332..7358c1c84f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.feature import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD @@ -48,4 +49,14 @@ trait VectorTransformer extends Serializable { data.map(x => this.transform(x)) } + /** + * Applies transformation on an JavaRDD[Vector]. + * + * @param data JavaRDD[Vector] to be transformed. + * @return transformed JavaRDD[Vector]. + */ + def transform(data: JavaRDD[Vector]): JavaRDD[Vector] = { + transform(data.rdd) + } + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index d321994c2a..f5f7ad613d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -432,7 +432,7 @@ class Word2VecModel private[mllib] ( throw new IllegalStateException(s"$word not in vocabulary") } } - + /** * Find synonyms of a word * @param word a word @@ -443,7 +443,7 @@ class Word2VecModel private[mllib] ( val vector = transform(word) findSynonyms(vector,num) } - + /** * Find synonyms of the vector representation of a word * @param vector vector representation of a word |