diff options
Diffstat (limited to 'examples/src/main/python/ml/tf_idf_example.py')
-rw-r--r-- | examples/src/main/python/ml/tf_idf_example.py | 3 |
1 file changed, 3 insertions, 0 deletions
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index fb4ad992fb..4ab7eb6964 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -34,8 +34,10 @@ if __name__ == "__main__":
         (0, "I wish Java could use case classes"),
         (1, "Logistic regression models are neat")
     ], ["label", "sentence"])
+
     tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
     wordsData = tokenizer.transform(sentenceData)
+
     hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
     featurizedData = hashingTF.transform(wordsData)
     # alternatively, CountVectorizer can also be used to get term frequency vectors
@@ -43,6 +45,7 @@ if __name__ == "__main__":
     idf = IDF(inputCol="rawFeatures", outputCol="features")
     idfModel = idf.fit(featurizedData)
     rescaledData = idfModel.transform(featurizedData)
+
     for features_label in rescaledData.select("features", "label").take(3):
         print(features_label)
     # $example off$