aboutsummaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
author Yuhao Yang <hhbyyh@gmail.com> 2016-04-20 11:45:08 +0100
committer Sean Owen <sowen@cloudera.com> 2016-04-20 11:45:08 +0100
commit ed9d80385486cd39a84a689ef467795262af919a (patch)
tree 954f979137825630508203428c0f6e6869373138 /examples
parent 17db4bfeaa0074298db622db38a5b0459518c4a9 (diff)
download spark-ed9d80385486cd39a84a689ef467795262af919a.tar.gz
spark-ed9d80385486cd39a84a689ef467795262af919a.tar.bz2
spark-ed9d80385486cd39a84a689ef467795262af919a.zip
[SPARK-14635][ML] Documentation and Examples for TF-IDF only refer to HashingTF
## What changes were proposed in this pull request?

Currently, the docs for TF-IDF only refer to using HashingTF with IDF. However, CountVectorizer can also be used. We should probably amend the user guide and examples to show this.

## How was this patch tested?

Unit tests and doc generation.

Author: Yuhao Yang <hhbyyh@gmail.com>

Closes #12454 from hhbyyh/tfdoc.
Diffstat (limited to 'examples')
-rw-r--r-- examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java | 2
-rw-r--r-- examples/src/main/python/ml/tf_idf_example.py | 2
-rw-r--r-- examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala | 2
3 files changed, 6 insertions, 0 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
index 37a3d0d84d..107c835f2e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTfIdfExample.java
@@ -63,6 +63,8 @@ public class JavaTfIdfExample {
.setOutputCol("rawFeatures")
.setNumFeatures(numFeatures);
Dataset<Row> featurizedData = hashingTF.transform(wordsData);
+ // alternatively, CountVectorizer can also be used to get term frequency vectors
+
IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
IDFModel idfModel = idf.fit(featurizedData);
Dataset<Row> rescaledData = idfModel.transform(featurizedData);
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index c92313378e..141324d458 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -37,6 +37,8 @@ if __name__ == "__main__":
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
+ # alternatively, CountVectorizer can also be used to get term frequency vectors
+
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
index 28115f9390..396f073e6b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -43,6 +43,8 @@ object TfIdfExample {
val hashingTF = new HashingTF()
.setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
val featurizedData = hashingTF.transform(wordsData)
+ // alternatively, CountVectorizer can also be used to get term frequency vectors
+
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)