Diffstat (limited to 'examples/src/main/python/ml/simple_text_classification_pipeline.py')
-rw-r--r--  examples/src/main/python/ml/simple_text_classification_pipeline.py | 72
1 file changed, 0 insertions(+), 72 deletions(-)
diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py
deleted file mode 100644
index b528b59be9..0000000000
--- a/examples/src/main/python/ml/simple_text_classification_pipeline.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-from pyspark.ml import Pipeline
-from pyspark.ml.classification import LogisticRegression
-from pyspark.ml.feature import HashingTF, Tokenizer
-from pyspark.sql import SparkSession
-
-
-"""
-A simple text classification pipeline that recognizes "spark" from
-input text. This is to show how to create and configure a Spark ML
-pipeline in Python. Run with:
-
- bin/spark-submit examples/src/main/python/ml/simple_text_classification_pipeline.py
-"""
-
-
-if __name__ == "__main__":
- spark = SparkSession\
- .builder\
- .appName("SimpleTextClassificationPipeline")\
- .getOrCreate()
-
- # Prepare training documents, which are labeled.
- training = spark.createDataFrame([
- (0, "a b c d e spark", 1.0),
- (1, "b d", 0.0),
- (2, "spark f g h", 1.0),
- (3, "hadoop mapreduce", 0.0)
- ], ["id", "text", "label"])
-
-    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
- tokenizer = Tokenizer(inputCol="text", outputCol="words")
- hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
- lr = LogisticRegression(maxIter=10, regParam=0.001)
- pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
-
- # Fit the pipeline to training documents.
- model = pipeline.fit(training)
-
- # Prepare test documents, which are unlabeled.
- test = spark.createDataFrame([
- (4, "spark i j k"),
- (5, "l m n"),
- (6, "spark hadoop spark"),
- (7, "apache hadoop")
- ], ["id", "text"])
-
- # Make predictions on test documents and print columns of interest.
- prediction = model.transform(test)
- selected = prediction.select("id", "text", "prediction")
- for row in selected.collect():
- print(row)
-
- spark.stop()
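
For reference, a minimal follow-on sketch of reusing the fitted pipeline, not part of the deleted file: it assumes the `spark` session and the `model` variable from the script above are still in scope, and the extra document (id 8, "mapreduce spark") is invented for illustration. LogisticRegression in spark.ml also emits a "probability" column, which the original example did not select:

    # Hypothetical extra document; the id and text are made up for this sketch.
    doc = spark.createDataFrame([(8, "mapreduce spark")], ["id", "text"])
    # The fitted PipelineModel tokenizes, hashes, and scores in one transform() call.
    scored = model.transform(doc)
    scored.select("id", "text", "probability", "prediction").show(truncate=False)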