Diffstat (limited to 'examples/src/main/python/ml/pipeline_example.py')
-rw-r--r--  examples/src/main/python/ml/pipeline_example.py  10
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index bd10cfd7a2..2d0865578a 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -38,12 +38,13 @@ if __name__ == "__main__":
(0L, "a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
- (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"])
+ (3L, "hadoop mapreduce", 0.0)
+ ], ["id", "text", "label"])
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
- lr = LogisticRegression(maxIter=10, regParam=0.01)
+ lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
# Fit the pipeline to training documents.
@@ -53,8 +54,9 @@ if __name__ == "__main__":
test = spark.createDataFrame([
(4L, "spark i j k"),
(5L, "l m n"),
- (6L, "mapreduce spark"),
- (7L, "apache hadoop")], ["id", "text"])
+ (6L, "spark hadoop spark"),
+ (7L, "apache hadoop")
+ ], ["id", "text"])
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
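
For context, below is a minimal runnable sketch of the example as it stands after this patch. It is an illustration, not the verbatim file: it assumes Spark 2.0+ (a SparkSession entry point, consistent with the `spark.createDataFrame` calls in the hunks) and Python 3, so the Python 2 long-literal suffixes (`0L`, `1L`, ...) visible in the diff are dropped. The regParam of 0.001 and the reworked test rows match the patched state.

# Sketch of the patched pipeline_example.py, assuming Python 3 and PySpark 2.0+.
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PipelineExample").getOrCreate()

    # Prepare training documents from a list of (id, text, label) tuples.
    training = spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0)
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages:
    # tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to the training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = spark.createDataFrame([
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop")
    ], ["id", "text"])

    # Make predictions on the test documents and print columns of interest.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)

    spark.stop()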