author      Davies Liu <davies@databricks.com>       2014-10-28 03:50:22 -0700
committer   Xiangrui Meng <meng@databricks.com>      2014-10-28 03:50:22 -0700
commit      fae095bc7c4097859af522ced77f09cf6be17691 (patch)
tree        8b4be3f716e8312c372105b4cc5132860bed2412 /mllib
parent      46c63417c1bb1aea07baf9036cc5b8f1c3781bbe (diff)
[SPARK-3961] [MLlib] [PySpark] Python API for mllib.feature
Added completed Python API for MLlib.feature:

  Normalizer
  StandardScalerModel
  StandardScaler
  HashTF
  IDFModel
  IDF

cc mengxr

Author: Davies Liu <davies@databricks.com>
Author: Davies Liu <davies.liu@gmail.com>

Closes #2819 from davies/feature and squashes the following commits:

4f48f48 [Davies Liu] add a note for HashingTF
67f6d21 [Davies Liu] address comments
b628693 [Davies Liu] rollback changes in Word2Vec
efb4f4f [Davies Liu] Merge branch 'master' into feature
806c7c2 [Davies Liu] address comments
3abb8c2 [Davies Liu] address comments
59781b9 [Davies Liu] Merge branch 'master' of github.com:apache/spark into feature
a405ae7 [Davies Liu] fix tests
7a1891a [Davies Liu] fix tests
486795f [Davies Liu] update programming guide, HashTF -> HashingTF
8a50584 [Davies Liu] Python API for mllib.feature
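For orientation, here is a minimal Scala sketch of the mllib.feature classes that the new Python stubs below delegate to; `sc` is an assumed SparkContext and all values are toy examples, not taken from the patch.

import org.apache.spark.mllib.feature.{HashingTF, IDF, Normalizer, StandardScaler}
import org.apache.spark.mllib.linalg.Vectors

val data = sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0, 3.0),
  Vectors.dense(4.0, 5.0, 6.0)))

// Normalizer.transform() accepts either a single Vector or an RDD[Vector].
val l2Normalized = new Normalizer(2.0).transform(data)

// StandardScaler.fit() returns a StandardScalerModel; transform() then scales each vector.
val scalerModel = new StandardScaler(withMean = false, withStd = true).fit(data)
val scaled = scalerModel.transform(data)

// HashingTF maps term sequences to term-frequency vectors; IDF.fit() returns
// an IDFModel that rescales them into TF-IDF vectors.
val docs = sc.parallelize(Seq(Seq("a", "b", "a"), Seq("b", "c")))
val tf = new HashingTF(1 << 10).transform(docs)
val tfidf = new IDF(minDocFreq = 0).fit(tf).transform(tf)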
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala   49
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala   11
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala             4
3 files changed, 60 insertions, 4 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index b478c21537..485abe2723 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -31,8 +31,7 @@ import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.{PythonRDD, SerDeUtil}
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.clustering._
-import org.apache.spark.mllib.feature.Word2Vec
-import org.apache.spark.mllib.feature.Word2VecModel
+import org.apache.spark.mllib.feature._
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.random.{RandomRDDs => RG}
@@ -292,6 +291,43 @@ class PythonMLLibAPI extends Serializable {
}
/**
+ * Java stub for Normalizer.transform()
+ */
+ def normalizeVector(p: Double, vector: Vector): Vector = {
+ new Normalizer(p).transform(vector)
+ }
+
+ /**
+ * Java stub for Normalizer.transform()
+ */
+ def normalizeVector(p: Double, rdd: JavaRDD[Vector]): JavaRDD[Vector] = {
+ new Normalizer(p).transform(rdd)
+ }
+
+ /**
+ * Java stub for StandardScaler.fit(). This stub returns a
+ * handle to the Java object instead of the content of the Java object.
+ * Extra care needs to be taken in the Python code to ensure it gets freed on
+ * exit; see the Py4J documentation.
+ */
+ def fitStandardScaler(
+ withMean: Boolean,
+ withStd: Boolean,
+ data: JavaRDD[Vector]): StandardScalerModel = {
+ new StandardScaler(withMean, withStd).fit(data.rdd)
+ }
+
+ /**
+ * Java stub for IDF.fit(). This stub returns a
+ * handle to the Java object instead of the content of the Java object.
+ * Extra care needs to be taken in the Python code to ensure it gets freed on
+ * exit; see the Py4J documentation.
+ */
+ def fitIDF(minDocFreq: Int, dataset: JavaRDD[Vector]): IDFModel = {
+ new IDF(minDocFreq).fit(dataset)
+ }
+
+ /**
* Java stub for Python mllib Word2Vec fit(). This stub returns a
* handle to the Java object instead of the content of the Java object.
* Extra care needs to be taken in the Python code to ensure it gets freed on
@@ -328,6 +364,15 @@ class PythonMLLibAPI extends Serializable {
model.transform(word)
}
+ /**
+ * Transforms an RDD of words to their vector representations
+ * @param rdd an RDD of words
+ * @return an RDD of vector representations of words
+ */
+ def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = {
+ rdd.rdd.map(model.transform)
+ }
+
def findSynonyms(word: String, num: Int): java.util.List[java.lang.Object] = {
val vec = transform(word)
findSynonyms(vec, num)
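The fit() stubs above return the fitted model itself rather than its contents, so the Python side only holds a Py4J handle and forwards later transform() calls to it. A rough driver-side sketch of that pattern in Scala (the stub names come from the patch; `sc` and the sample vectors are illustrative assumptions):

import org.apache.spark.mllib.api.python.PythonMLLibAPI
import org.apache.spark.mllib.linalg.Vectors

val api = new PythonMLLibAPI()
val vectors = sc.parallelize(Seq(Vectors.dense(1.0, 2.0), Vectors.dense(3.0, 4.0)))

// normalizeVector applies the transformation directly and returns data.
val normalized = api.normalizeVector(2.0, vectors.toJavaRDD())

// fitStandardScaler/fitIDF hand back model objects; callers keep them as
// handles and invoke transform() on them afterwards.
val scalerModel = api.fitStandardScaler(withMean = false, withStd = true, vectors.toJavaRDD())
val scaled = scalerModel.transform(vectors)

val idfModel = api.fitIDF(0, vectors.toJavaRDD())
val rescaled = idfModel.transform(vectors)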
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala
index 415a845332..7358c1c84f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/VectorTransformer.scala
@@ -18,6 +18,7 @@
package org.apache.spark.mllib.feature
import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
@@ -48,4 +49,14 @@ trait VectorTransformer extends Serializable {
data.map(x => this.transform(x))
}
+ /**
+ * Applies transformation on a JavaRDD[Vector].
+ *
+ * @param data JavaRDD[Vector] to be transformed.
+ * @return transformed JavaRDD[Vector].
+ */
+ def transform(data: JavaRDD[Vector]): JavaRDD[Vector] = {
+ transform(data.rdd)
+ }
+
}
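The new overload simply unwraps the JavaRDD and reuses the existing RDD[Vector] path, so any VectorTransformer (Normalizer, StandardScalerModel, IDFModel, ...) becomes directly callable from Java or Py4J. A minimal sketch, assuming an existing JavaSparkContext `jsc`:

import java.util.Arrays
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val javaVectors: JavaRDD[Vector] = jsc.parallelize(Arrays.asList(
  Vectors.dense(3.0, 4.0),
  Vectors.dense(0.0, 5.0)))

// Resolves to the transform(data: JavaRDD[Vector]) overload added above,
// which forwards to transform(data.rdd) and converts the result back.
val normalized: JavaRDD[Vector] = new Normalizer(2.0).transform(javaVectors)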
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index d321994c2a..f5f7ad613d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -432,7 +432,7 @@ class Word2VecModel private[mllib] (
throw new IllegalStateException(s"$word not in vocabulary")
}
}
-
+
/**
* Find synonyms of a word
* @param word a word
@@ -443,7 +443,7 @@ class Word2VecModel private[mllib] (
val vector = transform(word)
findSynonyms(vector,num)
}
-
+
/**
* Find synonyms of the vector representation of a word
* @param vector vector representation of a word