about summary refs log tree commit diff
path: root/examples/src/main/python/ml/tf_idf_example.py
diff options
context:
space:
mode:
Diffstat (limited to 'examples/src/main/python/ml/tf_idf_example.py')
-rw-r--r-- examples/src/main/python/ml/tf_idf_example.py 3
1 files changed, 3 insertions, 0 deletions
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index fb4ad992fb..4ab7eb6964 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -34,8 +34,10 @@ if __name__ == "__main__":
(0, "I wish Java could use case classes"),
(1, "Logistic regression models are neat")
], ["label", "sentence"])
+
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
+
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors
@@ -43,6 +45,7 @@ if __name__ == "__main__":
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
+
for features_label in rescaledData.select("features", "label").take(3):
print(features_label)
# $example off$