From ed9d80385486cd39a84a689ef467795262af919a Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Wed, 20 Apr 2016 11:45:08 +0100
Subject: [SPARK-14635][ML] Documentation and Examples for TF-IDF only refer to HashingTF

## What changes were proposed in this pull request?

Currently, the docs for TF-IDF only refer to using HashingTF with IDF. However, CountVectorizer can also be used. We should probably amend the user guide and examples to show this.

## How was this patch tested?

unit tests and doc generation

Author: Yuhao Yang

Closes #12454 from hhbyyh/tfdoc.
---
 examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'examples/src/main/scala')

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
index 28115f9390..396f073e6b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -43,6 +43,8 @@ object TfIdfExample {
     val hashingTF = new HashingTF()
       .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
     val featurizedData = hashingTF.transform(wordsData)
+    // alternatively, CountVectorizer can also be used to get term frequency vectors
+
     val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
     val idfModel = idf.fit(featurizedData)
     val rescaledData = idfModel.transform(featurizedData)
-- 
cgit v1.2.3