author     Bryan Cutler <cutlerb@gmail.com>    2016-07-14 09:12:46 +0100
committer  Sean Owen <sowen@cloudera.com>      2016-07-14 09:12:46 +0100
commit     e3f8a033679261aaee15bda0f970a1890411e743 (patch)
tree       fecc6121b1d5357c2214f710018de2a9ddea2786 /examples
parent     252d4f27f23b547777892bcea25a2cea62d8cbab (diff)
download   spark-e3f8a033679261aaee15bda0f970a1890411e743.tar.gz
           spark-e3f8a033679261aaee15bda0f970a1890411e743.tar.bz2
           spark-e3f8a033679261aaee15bda0f970a1890411e743.zip
[SPARK-16403][EXAMPLES] Cleanup to remove unused imports, consistent style, minor fixes
## What changes were proposed in this pull request?

Cleanup of the examples, mostly PySpark-ML, to fix minor issues: unused imports, inconsistent style, a duplicated pipeline example, use of the future print function, and a spelling error.

* The "Pipeline Example" is duplicated by "Simple Text Classification Pipeline" in Scala, Python, and Java.
* "Estimator Transformer Param Example" is duplicated by "Simple Params Example" in Scala, Python, and Java.
* Synced random_forest_classifier_example.py with the Scala version by adding an IndexToString label converter.
* Synced train_validation_split.py (ModelSelectionViaTrainValidationSplitExample in Scala) by adjusting the data split and adding a grid for the intercept.
* RegexTokenizer was doing nothing in tokenizer_example.py and JavaTokenizerExample.java; synced with the Scala version.

## How was this patch tested?

Local tests and running the modified examples.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #14081 from BryanCutler/examples-cleanup-SPARK-16403.
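As an illustration of the IndexToString label-converter pattern referenced above, here is a minimal PySpark sketch mirroring the updated random_forest_classifier_example.py (column names and the sample data path follow the example in the diff; treat this as an illustration of the pattern, not the exact file contents):

```python
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RandomForestClassifierExample").getOrCreate()

# Load the sample LIBSVM data shipped with Spark.
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

# Index labels; fit on the whole dataset to include all labels in the index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Index categorical features (features with > 4 distinct values stay continuous).
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                               maxCategories=4).fit(data)

rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                            numTrees=10)

# Convert indexed predictions back to the original label strings.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain indexers, forest, and the label converter in a Pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

trainingData, testData = data.randomSplit([0.7, 0.3])
model = pipeline.fit(trainingData)
model.transform(testData).select("predictedLabel", "label", "features").show(5)

spark.stop()
```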
Diffstat (limited to 'examples')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java | 4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java | 113
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java | 93
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java | 2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java | 19
-rw-r--r--  examples/src/main/python/ml/aft_survival_regression.py | 2
-rw-r--r--  examples/src/main/python/ml/bisecting_k_means_example.py | 2
-rw-r--r--  examples/src/main/python/ml/cross_validator.py | 3
-rw-r--r--  examples/src/main/python/ml/dataframe_example.py | 11
-rw-r--r--  examples/src/main/python/ml/decision_tree_classification_example.py | 2
-rw-r--r--  examples/src/main/python/ml/estimator_transformer_param_example.py | 20
-rw-r--r--  examples/src/main/python/ml/gaussian_mixture_example.py | 2
-rw-r--r--  examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py | 2
-rw-r--r--  examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py | 2
-rw-r--r--  examples/src/main/python/ml/isotonic_regression_example.py | 6
-rw-r--r--  examples/src/main/python/ml/kmeans_example.py | 4
-rw-r--r--  examples/src/main/python/ml/lda_example.py | 3
-rw-r--r--  examples/src/main/python/ml/multilayer_perceptron_classification.py | 5
-rw-r--r--  examples/src/main/python/ml/n_gram_example.py | 2
-rw-r--r--  examples/src/main/python/ml/naive_bayes_example.py | 4
-rw-r--r--  examples/src/main/python/ml/one_vs_rest_example.py | 5
-rw-r--r--  examples/src/main/python/ml/onehot_encoder_example.py | 1
-rw-r--r--  examples/src/main/python/ml/pca_example.py | 2
-rw-r--r--  examples/src/main/python/ml/pipeline_example.py | 10
-rw-r--r--  examples/src/main/python/ml/polynomial_expansion_example.py | 12
-rw-r--r--  examples/src/main/python/ml/quantile_discretizer_example.py | 8
-rw-r--r--  examples/src/main/python/ml/random_forest_classifier_example.py | 13
-rw-r--r--  examples/src/main/python/ml/random_forest_regressor_example.py | 2
-rw-r--r--  examples/src/main/python/ml/rformula_example.py | 2
-rw-r--r--  examples/src/main/python/ml/simple_params_example.py | 95
-rw-r--r--  examples/src/main/python/ml/simple_text_classification_pipeline.py | 72
-rw-r--r--  examples/src/main/python/ml/stopwords_remover_example.py | 2
-rw-r--r--  examples/src/main/python/ml/string_indexer_example.py | 1
-rw-r--r--  examples/src/main/python/ml/tf_idf_example.py | 3
-rw-r--r--  examples/src/main/python/ml/tokenizer_example.py | 13
-rw-r--r--  examples/src/main/python/ml/train_validation_split.py | 8
-rw-r--r--  examples/src/main/python/ml/vector_assembler_example.py | 2
-rw-r--r--  examples/src/main/python/ml/vector_indexer_example.py | 1
-rw-r--r--  examples/src/main/python/ml/word2vec_example.py | 2
-rw-r--r--  examples/src/main/python/streaming/queue_stream.py | 1
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala | 1
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala | 6
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala | 1
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala | 1
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala | 6
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala | 1
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala | 4
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala | 104
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala | 93
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala | 2
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala | 3
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala | 1
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala | 1
56 files changed, 142 insertions, 646 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
index 9a43189c91..4ccd8f6ce2 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
@@ -60,7 +60,7 @@ public class JavaPipelineExample {
.setOutputCol("features");
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
- .setRegParam(0.01);
+ .setRegParam(0.001);
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
@@ -71,7 +71,7 @@ public class JavaPipelineExample {
Dataset<Row> test = spark.createDataFrame(Arrays.asList(
new JavaDocument(4L, "spark i j k"),
new JavaDocument(5L, "l m n"),
- new JavaDocument(6L, "mapreduce spark"),
+ new JavaDocument(6L, "spark hadoop spark"),
new JavaDocument(7L, "apache hadoop")
), JavaDocument.class);
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
deleted file mode 100644
index ca80d0d8bb..0000000000
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.ml;
-
-import java.util.List;
-
-import com.google.common.collect.Lists;
-
-import org.apache.spark.ml.classification.LogisticRegressionModel;
-import org.apache.spark.ml.param.ParamMap;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
-
-/**
- * A simple example demonstrating ways to specify parameters for Estimators and Transformers.
- * Run with
- * {{{
- * bin/run-example ml.JavaSimpleParamsExample
- * }}}
- */
-public class JavaSimpleParamsExample {
-
- public static void main(String[] args) {
- SparkSession spark = SparkSession
- .builder()
- .appName("JavaSimpleParamsExample")
- .getOrCreate();
-
- // Prepare training data.
- // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
- // into DataFrames, where it uses the bean metadata to infer the schema.
- List<LabeledPoint> localTraining = Lists.newArrayList(
- new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
- new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
- new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
- new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
- Dataset<Row> training =
- spark.createDataFrame(localTraining, LabeledPoint.class);
-
- // Create a LogisticRegression instance. This instance is an Estimator.
- LogisticRegression lr = new LogisticRegression();
- // Print out the parameters, documentation, and any default values.
- System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
-
- // We may set parameters using setter methods.
- lr.setMaxIter(10)
- .setRegParam(0.01);
-
- // Learn a LogisticRegression model. This uses the parameters stored in lr.
- LogisticRegressionModel model1 = lr.fit(training);
- // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
- // we can view the parameters it used during fit().
- // This prints the parameter (name: value) pairs, where names are unique IDs for this
- // LogisticRegression instance.
- System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());
-
- // We may alternatively specify parameters using a ParamMap.
- ParamMap paramMap = new ParamMap();
- paramMap.put(lr.maxIter().w(20)); // Specify 1 Param.
- paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter.
- double[] thresholds = {0.5, 0.5};
- paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params.
-
- // One can also combine ParamMaps.
- ParamMap paramMap2 = new ParamMap();
- paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name.
- ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
-
- // Now learn a new model using the paramMapCombined parameters.
- // paramMapCombined overrides all parameters set earlier via lr.set* methods.
- LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
- System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
-
- // Prepare test documents.
- List<LabeledPoint> localTest = Lists.newArrayList(
- new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
- new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
- new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
- Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class);
-
- // Make predictions on test documents using the Transformer.transform() method.
- // LogisticRegressionModel.transform will only use the 'features' column.
- // Note that model2.transform() outputs a 'myProbability' column instead of the usual
- // 'probability' column since we renamed the lr.probabilityCol parameter previously.
- Dataset<Row> results = model2.transform(test);
- Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
- for (Row r: rows.collectAsList()) {
- System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
- + ", prediction=" + r.get(3));
- }
-
- spark.stop();
- }
-}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
deleted file mode 100644
index 7c24c46d2e..0000000000
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.ml;
-
-import java.util.List;
-
-import com.google.common.collect.Lists;
-
-import org.apache.spark.ml.Pipeline;
-import org.apache.spark.ml.PipelineModel;
-import org.apache.spark.ml.PipelineStage;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.feature.HashingTF;
-import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
-
-/**
- * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
- * bean classes {@link LabeledDocument} and {@link Document} defined in the Scala counterpart of
- * this example {@link SimpleTextClassificationPipeline}. Run with
- * <pre>
- * bin/run-example ml.JavaSimpleTextClassificationPipeline
- * </pre>
- */
-public class JavaSimpleTextClassificationPipeline {
-
- public static void main(String[] args) {
- SparkSession spark = SparkSession
- .builder()
- .appName("JavaSimpleTextClassificationPipeline")
- .getOrCreate();
-
- // Prepare training documents, which are labeled.
- List<LabeledDocument> localTraining = Lists.newArrayList(
- new LabeledDocument(0L, "a b c d e spark", 1.0),
- new LabeledDocument(1L, "b d", 0.0),
- new LabeledDocument(2L, "spark f g h", 1.0),
- new LabeledDocument(3L, "hadoop mapreduce", 0.0));
- Dataset<Row> training =
- spark.createDataFrame(localTraining, LabeledDocument.class);
-
- // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
- Tokenizer tokenizer = new Tokenizer()
- .setInputCol("text")
- .setOutputCol("words");
- HashingTF hashingTF = new HashingTF()
- .setNumFeatures(1000)
- .setInputCol(tokenizer.getOutputCol())
- .setOutputCol("features");
- LogisticRegression lr = new LogisticRegression()
- .setMaxIter(10)
- .setRegParam(0.001);
- Pipeline pipeline = new Pipeline()
- .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
-
- // Fit the pipeline to training documents.
- PipelineModel model = pipeline.fit(training);
-
- // Prepare test documents, which are unlabeled.
- List<Document> localTest = Lists.newArrayList(
- new Document(4L, "spark i j k"),
- new Document(5L, "l m n"),
- new Document(6L, "spark hadoop spark"),
- new Document(7L, "apache hadoop"));
- Dataset<Row> test = spark.createDataFrame(localTest, Document.class);
-
- // Make predictions on test documents.
- Dataset<Row> predictions = model.transform(test);
- for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) {
- System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
- + ", prediction=" + r.get(3));
- }
-
- spark.stop();
- }
-}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
index def5994429..278cce0842 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -47,7 +47,7 @@ public class JavaStopWordsRemoverExample {
.setOutputCol("filtered");
List<Row> data = Arrays.asList(
- RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+ RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
);
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
index 1cc16bb60d..a206cef4c2 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -57,17 +57,24 @@ public class JavaTokenizerExample {
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
- Dataset<Row> wordsDataFrame = tokenizer.transform(sentenceDataFrame);
- for (Row r : wordsDataFrame.select("words", "label").takeAsList(3)) {
+ RegexTokenizer regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+
+ Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
+ for (Row r : tokenized.select("words", "label").takeAsList(3)) {
java.util.List<String> words = r.getList(0);
for (String word : words) System.out.print(word + " ");
System.out.println();
}
- RegexTokenizer regexTokenizer = new RegexTokenizer()
- .setInputCol("sentence")
- .setOutputCol("words")
- .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+ Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
+ for (Row r : regexTokenized.select("words", "label").takeAsList(3)) {
+ java.util.List<String> words = r.getList(0);
+ for (String word : words) System.out.print(word + " ");
+ System.out.println();
+ }
// $example off$
spark.stop();
}
diff --git a/examples/src/main/python/ml/aft_survival_regression.py b/examples/src/main/python/ml/aft_survival_regression.py
index 060f0171ff..2f0ca995e5 100644
--- a/examples/src/main/python/ml/aft_survival_regression.py
+++ b/examples/src/main/python/ml/aft_survival_regression.py
@@ -32,7 +32,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession \
.builder \
- .appName("PythonAFTSurvivalRegressionExample") \
+ .appName("AFTSurvivalRegressionExample") \
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index ee0399ac5e..1263cb5d17 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -31,7 +31,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("PythonBisectingKMeansExample")\
+ .appName("BisectingKMeansExample")\
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/cross_validator.py b/examples/src/main/python/ml/cross_validator.py
index a41df6cf94..907eec67a0 100644
--- a/examples/src/main/python/ml/cross_validator.py
+++ b/examples/src/main/python/ml/cross_validator.py
@@ -24,7 +24,7 @@ from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# $example off$
-from pyspark.sql import Row, SparkSession
+from pyspark.sql import SparkSession
"""
A simple example demonstrating model selection using CrossValidator.
@@ -39,6 +39,7 @@ if __name__ == "__main__":
.builder\
.appName("CrossValidatorExample")\
.getOrCreate()
+
# $example on$
# Prepare training documents, which are labeled.
training = spark.createDataFrame([
diff --git a/examples/src/main/python/ml/dataframe_example.py b/examples/src/main/python/ml/dataframe_example.py
index c1818d72fe..109f901012 100644
--- a/examples/src/main/python/ml/dataframe_example.py
+++ b/examples/src/main/python/ml/dataframe_example.py
@@ -34,15 +34,16 @@ if __name__ == "__main__":
if len(sys.argv) > 2:
print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
exit(-1)
- spark = SparkSession\
- .builder\
- .appName("DataFrameExample")\
- .getOrCreate()
- if len(sys.argv) == 2:
+ elif len(sys.argv) == 2:
input = sys.argv[1]
else:
input = "data/mllib/sample_libsvm_data.txt"
+ spark = SparkSession \
+ .builder \
+ .appName("DataFrameExample") \
+ .getOrCreate()
+
# Load input data
print("Loading LIBSVM file with UDT from " + input + ".")
df = spark.read.format("libsvm").load(input).cache()
diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py
index 708f1af6cc..d6e2977de0 100644
--- a/examples/src/main/python/ml/decision_tree_classification_example.py
+++ b/examples/src/main/python/ml/decision_tree_classification_example.py
@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("decision_tree_classification_example")\
+ .appName("DecisionTreeClassificationExample")\
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/estimator_transformer_param_example.py b/examples/src/main/python/ml/estimator_transformer_param_example.py
index 3bd3fd30f8..eb21051435 100644
--- a/examples/src/main/python/ml/estimator_transformer_param_example.py
+++ b/examples/src/main/python/ml/estimator_transformer_param_example.py
@@ -18,6 +18,7 @@
"""
Estimator Transformer Param Example.
"""
+from __future__ import print_function
# $example on$
from pyspark.ml.linalg import Vectors
@@ -42,7 +43,7 @@ if __name__ == "__main__":
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
- print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"
+ print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)
@@ -51,8 +52,8 @@ if __name__ == "__main__":
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
- print "Model 1 was fit using parameters: "
- print model1.extractParamMap()
+ print("Model 1 was fit using parameters: ")
+ print(model1.extractParamMap())
# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
@@ -67,8 +68,8 @@ if __name__ == "__main__":
# Now learn a new model using the paramMapCombined parameters.
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, paramMapCombined)
- print "Model 2 was fit using parameters: "
- print model2.extractParamMap()
+ print("Model 2 was fit using parameters: ")
+ print(model2.extractParamMap())
# Prepare test data
test = spark.createDataFrame([
@@ -81,9 +82,12 @@ if __name__ == "__main__":
# Note that model2.transform() outputs a "myProbability" column instead of the usual
# 'probability' column since we renamed the lr.probabilityCol parameter previously.
prediction = model2.transform(test)
- selected = prediction.select("features", "label", "myProbability", "prediction")
- for row in selected.collect():
- print row
+ result = prediction.select("features", "label", "myProbability", "prediction") \
+ .collect()
+
+ for row in result:
+ print("features=%s, label=%s -> prob=%s, prediction=%s"
+ % (row.features, row.label, row.myProbability, row.prediction))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
index 2ca13d68f6..edc258de05 100644
--- a/examples/src/main/python/ml/gaussian_mixture_example.py
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -31,7 +31,7 @@ Run with:
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("PythonGuassianMixtureExample")\
+ .appName("GaussianMixtureExample")\
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
index 6c2d7e7b81..c2042fd7b7 100644
--- a/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
+++ b/examples/src/main/python/ml/gradient_boosted_tree_classifier_example.py
@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("gradient_boosted_tree_classifier_example")\
+ .appName("GradientBoostedTreeClassifierExample")\
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
index 5dd2272748..cc96c973e4 100644
--- a/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
+++ b/examples/src/main/python/ml/gradient_boosted_tree_regressor_example.py
@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("gradient_boosted_tree_regressor_example")\
+ .appName("GradientBoostedTreeRegressorExample")\
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/isotonic_regression_example.py b/examples/src/main/python/ml/isotonic_regression_example.py
index 1e61bd8eff..a41b8ffacb 100644
--- a/examples/src/main/python/ml/isotonic_regression_example.py
+++ b/examples/src/main/python/ml/isotonic_regression_example.py
@@ -21,7 +21,7 @@ Isotonic Regression Example.
from __future__ import print_function
# $example on$
-from pyspark.ml.regression import IsotonicRegression, IsotonicRegressionModel
+from pyspark.ml.regression import IsotonicRegression
# $example off$
from pyspark.sql import SparkSession
@@ -30,11 +30,11 @@ An example demonstrating isotonic regression.
Run with:
bin/spark-submit examples/src/main/python/ml/isotonic_regression_example.py
"""
-if __name__ == "__main__":
+if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("PythonIsotonicRegressionExample")\
+ .appName("IsotonicRegressionExample")\
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py
index 4b8b7291f9..6846ec4599 100644
--- a/examples/src/main/python/ml/kmeans_example.py
+++ b/examples/src/main/python/ml/kmeans_example.py
@@ -31,12 +31,10 @@ Run with:
This example requires NumPy (http://www.numpy.org/).
"""
-
if __name__ == "__main__":
-
spark = SparkSession\
.builder\
- .appName("PythonKMeansExample")\
+ .appName("KMeansExample")\
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/lda_example.py b/examples/src/main/python/ml/lda_example.py
index 5ce810fccc..2dc1742ff7 100644
--- a/examples/src/main/python/ml/lda_example.py
+++ b/examples/src/main/python/ml/lda_example.py
@@ -23,16 +23,13 @@ from pyspark.ml.clustering import LDA
# $example off$
from pyspark.sql import SparkSession
-
"""
An example demonstrating LDA.
Run with:
bin/spark-submit examples/src/main/python/ml/lda_example.py
"""
-
if __name__ == "__main__":
- # Creates a SparkSession
spark = SparkSession \
.builder \
.appName("LDAExample") \
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index aa33bef5a3..2cc38c2855 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -31,18 +31,23 @@ if __name__ == "__main__":
# Load training data
data = spark.read.format("libsvm")\
.load("data/mllib/sample_multiclass_classification_data.txt")
+
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
test = splits[1]
+
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 3]
+
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
+
# train the model
model = trainer.fit(train)
+
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
index 9ac07f2c8e..55263adb46 100644
--- a/examples/src/main/python/ml/n_gram_example.py
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -34,8 +34,10 @@ if __name__ == "__main__":
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
], ["label", "words"])
+
ngram = NGram(inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)
+
for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
print(ngrams_label)
# $example off$
diff --git a/examples/src/main/python/ml/naive_bayes_example.py b/examples/src/main/python/ml/naive_bayes_example.py
index 8bc32222fe..aa23f298c8 100644
--- a/examples/src/main/python/ml/naive_bayes_example.py
+++ b/examples/src/main/python/ml/naive_bayes_example.py
@@ -26,13 +26,14 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("naive_bayes_example")\
+ .appName("NaiveBayesExample")\
.getOrCreate()
# $example on$
# Load training data
data = spark.read.format("libsvm") \
.load("data/mllib/sample_libsvm_data.txt")
+
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
@@ -43,6 +44,7 @@ if __name__ == "__main__":
# train the model
model = nb.fit(train)
+
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
diff --git a/examples/src/main/python/ml/one_vs_rest_example.py b/examples/src/main/python/ml/one_vs_rest_example.py
index b82087beba..8e00c25d93 100644
--- a/examples/src/main/python/ml/one_vs_rest_example.py
+++ b/examples/src/main/python/ml/one_vs_rest_example.py
@@ -30,11 +30,10 @@ Run with:
bin/spark-submit examples/src/main/python/ml/one_vs_rest_example.py
"""
-
if __name__ == "__main__":
spark = SparkSession \
.builder \
- .appName("PythonOneVsRestExample") \
+ .appName("OneVsRestExample") \
.getOrCreate()
# $example on$
@@ -62,7 +61,7 @@ if __name__ == "__main__":
# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
- print("Test Error : " + str(1 - accuracy))
+ print("Test Error = %g" % (1.0 - accuracy))
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
index b9fceef68e..47faf8d202 100644
--- a/examples/src/main/python/ml/onehot_encoder_example.py
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -41,6 +41,7 @@ if __name__ == "__main__":
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)
+
encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
encoded = encoder.transform(indexed)
encoded.select("id", "categoryVec").show()
diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py
index 414629ff88..38746aced0 100644
--- a/examples/src/main/python/ml/pca_example.py
+++ b/examples/src/main/python/ml/pca_example.py
@@ -34,8 +34,10 @@ if __name__ == "__main__":
(Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
(Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
df = spark.createDataFrame(data, ["features"])
+
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)
+
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)
# $example off$
diff --git a/examples/src/main/python/ml/pipeline_example.py b/examples/src/main/python/ml/pipeline_example.py
index bd10cfd7a2..2d0865578a 100644
--- a/examples/src/main/python/ml/pipeline_example.py
+++ b/examples/src/main/python/ml/pipeline_example.py
@@ -38,12 +38,13 @@ if __name__ == "__main__":
(0L, "a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
- (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"])
+ (3L, "hadoop mapreduce", 0.0)
+ ], ["id", "text", "label"])
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
- lr = LogisticRegression(maxIter=10, regParam=0.01)
+ lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
# Fit the pipeline to training documents.
@@ -53,8 +54,9 @@ if __name__ == "__main__":
test = spark.createDataFrame([
(4L, "spark i j k"),
(5L, "l m n"),
- (6L, "mapreduce spark"),
- (7L, "apache hadoop")], ["id", "text"])
+ (6L, "spark hadoop spark"),
+ (7L, "apache hadoop")
+ ], ["id", "text"])
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
index b46c1ba2f4..b464ee86b6 100644
--- a/examples/src/main/python/ml/polynomial_expansion_example.py
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -30,13 +30,15 @@ if __name__ == "__main__":
.getOrCreate()
# $example on$
- df = spark\
- .createDataFrame([(Vectors.dense([-2.0, 2.3]),),
- (Vectors.dense([0.0, 0.0]),),
- (Vectors.dense([0.6, -1.1]),)],
- ["features"])
+ df = spark.createDataFrame([
+ (Vectors.dense([-2.0, 2.3]),),
+ (Vectors.dense([0.0, 0.0]),),
+ (Vectors.dense([0.6, -1.1]),)
+ ], ["features"])
+
px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures")
polyDF = px.transform(df)
+
for expanded in polyDF.select("polyFeatures").take(3):
print(expanded)
# $example off$
diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py
index 6f422f840a..788a0baffe 100644
--- a/examples/src/main/python/ml/quantile_discretizer_example.py
+++ b/examples/src/main/python/ml/quantile_discretizer_example.py
@@ -22,18 +22,22 @@ from pyspark.ml.feature import QuantileDiscretizer
# $example off$
from pyspark.sql import SparkSession
-
if __name__ == "__main__":
- spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate()
+ spark = SparkSession\
+ .builder\
+ .appName("QuantileDiscretizerExample")\
+ .getOrCreate()
# $example on$
data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
df = spark.createDataFrame(data, ["id", "hour"])
# $example off$
+
# Output of QuantileDiscretizer for such small datasets can depend on the number of
# partitions. Here we force a single partition to ensure consistent results.
# Note this is not necessary for normal use cases
df = df.repartition(1)
+
# $example on$
discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py
index eb9ded9af5..4eaa94dd7f 100644
--- a/examples/src/main/python/ml/random_forest_classifier_example.py
+++ b/examples/src/main/python/ml/random_forest_classifier_example.py
@@ -23,7 +23,7 @@ from __future__ import print_function
# $example on$
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
-from pyspark.ml.feature import StringIndexer, VectorIndexer
+from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# $example off$
from pyspark.sql import SparkSession
@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("random_forest_classifier_example")\
+ .appName("RandomForestClassifierExample")\
.getOrCreate()
# $example on$
@@ -41,6 +41,7 @@ if __name__ == "__main__":
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
+
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
@@ -52,8 +53,12 @@ if __name__ == "__main__":
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
+ # Convert indexed labels back to original labels.
+ labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
+ labels=labelIndexer.labels)
+
# Chain indexers and forest in a Pipeline
- pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
+ pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])
# Train model. This also runs the indexers.
model = pipeline.fit(trainingData)
@@ -62,7 +67,7 @@ if __name__ == "__main__":
predictions = model.transform(testData)
# Select example rows to display.
- predictions.select("prediction", "indexedLabel", "features").show(5)
+ predictions.select("predictedLabel", "label", "features").show(5)
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
diff --git a/examples/src/main/python/ml/random_forest_regressor_example.py b/examples/src/main/python/ml/random_forest_regressor_example.py
index 3a793737db..a34edff2ec 100644
--- a/examples/src/main/python/ml/random_forest_regressor_example.py
+++ b/examples/src/main/python/ml/random_forest_regressor_example.py
@@ -31,7 +31,7 @@ from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
- .appName("random_forest_regressor_example")\
+ .appName("RandomForestRegressorExample")\
.getOrCreate()
# $example on$
diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py
index d5df3ce4f5..6629239db2 100644
--- a/examples/src/main/python/ml/rformula_example.py
+++ b/examples/src/main/python/ml/rformula_example.py
@@ -34,10 +34,12 @@ if __name__ == "__main__":
(8, "CA", 12, 0.0),
(9, "NZ", 15, 0.0)],
["id", "country", "hour", "clicked"])
+
formula = RFormula(
formula="clicked ~ country + hour",
featuresCol="features",
labelCol="label")
+
output = formula.fit(dataset).transform(dataset)
output.select("features", "label").show()
# $example off$
diff --git a/examples/src/main/python/ml/simple_params_example.py b/examples/src/main/python/ml/simple_params_example.py
deleted file mode 100644
index 2f1eaa6f94..0000000000
--- a/examples/src/main/python/ml/simple_params_example.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-import pprint
-import sys
-
-from pyspark.ml.classification import LogisticRegression
-from pyspark.ml.linalg import DenseVector
-from pyspark.sql import Row, SparkSession
-
-"""
-A simple example demonstrating ways to specify parameters for Estimators and Transformers.
-Run with:
- bin/spark-submit examples/src/main/python/ml/simple_params_example.py
-"""
-
-if __name__ == "__main__":
- spark = SparkSession \
- .builder \
- .appName("SimpleParamsExample") \
- .getOrCreate()
-
- # prepare training data.
- # We create an RDD of LabeledPoints and convert them into a DataFrame.
- # A LabeledPoint is an Object with two fields named label and features
- # and Spark SQL identifies these fields and creates the schema appropriately.
- training = spark.createDataFrame([
- Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])),
- Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])),
- Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])),
- Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))])
-
- # Create a LogisticRegression instance with maxIter = 10.
- # This instance is an Estimator.
- lr = LogisticRegression(maxIter=10)
- # Print out the parameters, documentation, and any default values.
- print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
-
- # We may also set parameters using setter methods.
- lr.setRegParam(0.01)
-
- # Learn a LogisticRegression model. This uses the parameters stored in lr.
- model1 = lr.fit(training)
-
- # Since model1 is a Model (i.e., a Transformer produced by an Estimator),
- # we can view the parameters it used during fit().
- # This prints the parameter (name: value) pairs, where names are unique IDs for this
- # LogisticRegression instance.
- print("Model 1 was fit using parameters:\n")
- pprint.pprint(model1.extractParamMap())
-
- # We may alternatively specify parameters using a parameter map.
- # paramMap overrides all lr parameters set earlier.
- paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}
-
- # Now learn a new model using the new parameters.
- model2 = lr.fit(training, paramMap)
- print("Model 2 was fit using parameters:\n")
- pprint.pprint(model2.extractParamMap())
-
- # prepare test data.
- test = spark.createDataFrame([
- Row(label=1.0, features=DenseVector([-1.0, 1.5, 1.3])),
- Row(label=0.0, features=DenseVector([3.0, 2.0, -0.1])),
- Row(label=0.0, features=DenseVector([0.0, 2.2, -1.5]))])
-
- # Make predictions on test data using the Transformer.transform() method.
- # LogisticRegressionModel.transform will only use the 'features' column.
- # Note that model2.transform() outputs a 'myProbability' column instead of the usual
- # 'probability' column since we renamed the lr.probabilityCol parameter previously.
- result = model2.transform(test) \
- .select("features", "label", "myProbability", "prediction") \
- .collect()
-
- for row in result:
- print("features=%s,label=%s -> prob=%s, prediction=%s"
- % (row.features, row.label, row.myProbability, row.prediction))
-
- spark.stop()
diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py
deleted file mode 100644
index b528b59be9..0000000000
--- a/examples/src/main/python/ml/simple_text_classification_pipeline.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-
-from pyspark.ml import Pipeline
-from pyspark.ml.classification import LogisticRegression
-from pyspark.ml.feature import HashingTF, Tokenizer
-from pyspark.sql import Row, SparkSession
-
-
-"""
-A simple text classification pipeline that recognizes "spark" from
-input text. This is to show how to create and configure a Spark ML
-pipeline in Python. Run with:
-
- bin/spark-submit examples/src/main/python/ml/simple_text_classification_pipeline.py
-"""
-
-
-if __name__ == "__main__":
- spark = SparkSession\
- .builder\
- .appName("SimpleTextClassificationPipeline")\
- .getOrCreate()
-
- # Prepare training documents, which are labeled.
- training = spark.createDataFrame([
- (0, "a b c d e spark", 1.0),
- (1, "b d", 0.0),
- (2, "spark f g h", 1.0),
- (3, "hadoop mapreduce", 0.0)
- ], ["id", "text", "label"])
-
- # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
- tokenizer = Tokenizer(inputCol="text", outputCol="words")
- hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features")
- lr = LogisticRegression(maxIter=10, regParam=0.001)
- pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
-
- # Fit the pipeline to training documents.
- model = pipeline.fit(training)
-
- # Prepare test documents, which are unlabeled.
- test = spark.createDataFrame([
- (4, "spark i j k"),
- (5, "l m n"),
- (6, "spark hadoop spark"),
- (7, "apache hadoop")
- ], ["id", "text"])
-
- # Make predictions on test documents and print columns of interest.
- prediction = model.transform(test)
- selected = prediction.select("id", "text", "prediction")
- for row in selected.collect():
- print(row)
-
- spark.stop()
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
index 395fdeffc5..8a8392cc1f 100644
--- a/examples/src/main/python/ml/stopwords_remover_example.py
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -30,7 +30,7 @@ if __name__ == "__main__":
# $example on$
sentenceData = spark.createDataFrame([
- (0, ["I", "saw", "the", "red", "baloon"]),
+ (0, ["I", "saw", "the", "red", "balloon"]),
(1, ["Mary", "had", "a", "little", "lamb"])
], ["label", "raw"])
diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py
index a328e040f5..2255bfb9c1 100644
--- a/examples/src/main/python/ml/string_indexer_example.py
+++ b/examples/src/main/python/ml/string_indexer_example.py
@@ -32,6 +32,7 @@ if __name__ == "__main__":
df = spark.createDataFrame(
[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
["id", "category"])
+
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()
diff --git a/examples/src/main/python/ml/tf_idf_example.py b/examples/src/main/python/ml/tf_idf_example.py
index fb4ad992fb..4ab7eb6964 100644
--- a/examples/src/main/python/ml/tf_idf_example.py
+++ b/examples/src/main/python/ml/tf_idf_example.py
@@ -34,8 +34,10 @@ if __name__ == "__main__":
(0, "I wish Java could use case classes"),
(1, "Logistic regression models are neat")
], ["label", "sentence"])
+
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
+
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors
@@ -43,6 +45,7 @@ if __name__ == "__main__":
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
+
for features_label in rescaledData.select("features", "label").take(3):
print(features_label)
# $example off$
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
index e61ec920d2..89f5060705 100644
--- a/examples/src/main/python/ml/tokenizer_example.py
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -34,12 +34,19 @@ if __name__ == "__main__":
(1, "I wish Java could use case classes"),
(2, "Logistic,regression,models,are,neat")
], ["label", "sentence"])
+
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
- wordsDataFrame = tokenizer.transform(sentenceDataFrame)
- for words_label in wordsDataFrame.select("words", "label").take(3):
- print(words_label)
+
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)
+
+ tokenized = tokenizer.transform(sentenceDataFrame)
+ for words_label in tokenized.select("words", "label").take(3):
+ print(words_label)
+
+ regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+ for words_label in regexTokenized.select("words", "label").take(3):
+ print(words_label)
# $example off$
spark.stop()
diff --git a/examples/src/main/python/ml/train_validation_split.py b/examples/src/main/python/ml/train_validation_split.py
index 5f5c52aca8..a92b861f83 100644
--- a/examples/src/main/python/ml/train_validation_split.py
+++ b/examples/src/main/python/ml/train_validation_split.py
@@ -35,18 +35,21 @@ if __name__ == "__main__":
.builder\
.appName("TrainValidationSplit")\
.getOrCreate()
+
# $example on$
# Prepare training and test data.
data = spark.read.format("libsvm")\
.load("data/mllib/sample_linear_regression_data.txt")
- train, test = data.randomSplit([0.7, 0.3])
- lr = LinearRegression(maxIter=10, regParam=0.1)
+ train, test = data.randomSplit([0.9, 0.1], seed=12345)
+
+ lr = LinearRegression(maxIter=10)
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder()\
.addGrid(lr.regParam, [0.1, 0.01]) \
+ .addGrid(lr.fitIntercept, [False, True])\
.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
.build()
@@ -60,6 +63,7 @@ if __name__ == "__main__":
# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)
+
# Make predictions on test data. model is the model with combination of parameters
# that performed best.
prediction = model.transform(test)
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
index bbfc316ff2..eac33711ad 100644
--- a/examples/src/main/python/ml/vector_assembler_example.py
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -33,9 +33,11 @@ if __name__ == "__main__":
dataset = spark.createDataFrame(
[(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
["id", "hour", "mobile", "userFeatures", "clicked"])
+
assembler = VectorAssembler(
inputCols=["hour", "mobile", "userFeatures"],
outputCol="features")
+
output = assembler.transform(dataset)
print(output.select("features", "clicked").first())
# $example off$
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
index 9b00e0f841..3912c135be 100644
--- a/examples/src/main/python/ml/vector_indexer_example.py
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -30,6 +30,7 @@ if __name__ == "__main__":
# $example on$
data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)
diff --git a/examples/src/main/python/ml/word2vec_example.py b/examples/src/main/python/ml/word2vec_example.py
index 66500bee15..78a91c92fc 100644
--- a/examples/src/main/python/ml/word2vec_example.py
+++ b/examples/src/main/python/ml/word2vec_example.py
@@ -35,9 +35,11 @@ if __name__ == "__main__":
("I wish Java could use case classes".split(" "), ),
("Logistic regression models are neat".split(" "), )
], ["text"])
+
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
+
result = model.transform(documentDF)
for feature in result.select("result").take(3):
print(feature)
diff --git a/examples/src/main/python/streaming/queue_stream.py b/examples/src/main/python/streaming/queue_stream.py
index b3808907f7..bdd2d48519 100644
--- a/examples/src/main/python/streaming/queue_stream.py
+++ b/examples/src/main/python/streaming/queue_stream.py
@@ -22,7 +22,6 @@
To run this example use
`$ bin/spark-submit examples/src/main/python/streaming/queue_stream.py
"""
-import sys
import time
from pyspark import SparkContext
diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala
index a68fd0285f..86eed3867c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala
@@ -18,7 +18,6 @@
// scalastyle:off println
package org.apache.spark.examples
-import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
/**
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
index 2c2bf421bc..26095b46f5 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
@@ -33,8 +33,10 @@ import org.apache.spark.sql.SparkSession
*/
object GaussianMixtureExample {
def main(args: Array[String]): Unit = {
- // Creates a SparkSession
- val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").getOrCreate()
+ val spark = SparkSession
+ .builder
+ .appName(s"${this.getClass.getSimpleName}")
+ .getOrCreate()
// $example on$
// Loads data
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
index 7c5d3f2341..a840559d24 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/IsotonicRegressionExample.scala
@@ -33,8 +33,6 @@ import org.apache.spark.sql.SparkSession
object IsotonicRegressionExample {
def main(args: Array[String]): Unit = {
-
- // Creates a SparkSession.
val spark = SparkSession
.builder
.appName(s"${this.getClass.getSimpleName}")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
index 2341b36db2..a1d19e138d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala
@@ -34,7 +34,6 @@ import org.apache.spark.sql.SparkSession
object KMeansExample {
def main(args: Array[String]): Unit = {
- // Creates a SparkSession.
val spark = SparkSession
.builder
.appName(s"${this.getClass.getSimpleName}")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
index 75fef2922a..1cd2641f9a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ModelSelectionViaTrainValidationSplitExample.scala
@@ -46,6 +46,7 @@ object ModelSelectionViaTrainValidationSplitExample {
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
val lr = new LinearRegression()
+ .setMaxIter(10)
// We use a ParamGridBuilder to construct a grid of parameters to search over.
// TrainValidationSplit will try all combinations of values and determine best model using
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
index e8a9b32da9..a39e3202ba 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
@@ -39,27 +39,33 @@ object MultilayerPerceptronClassifierExample {
// Load the data stored in LIBSVM format as a DataFrame.
val data = spark.read.format("libsvm")
.load("data/mllib/sample_multiclass_classification_data.txt")
+
// Split the data into train and test
val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L)
val train = splits(0)
val test = splits(1)
+
// specify layers for the neural network:
// input layer of size 4 (features), two intermediate of size 5 and 4
// and output of size 3 (classes)
val layers = Array[Int](4, 5, 4, 3)
+
// create the trainer and set its parameters
val trainer = new MultilayerPerceptronClassifier()
.setLayers(layers)
.setBlockSize(128)
.setSeed(1234L)
.setMaxIter(100)
+
// train the model
val model = trainer.fit(train)
+
// compute accuracy on the test set
val result = model.transform(test)
val predictionAndLabels = result.select("prediction", "label")
val evaluator = new MulticlassClassificationEvaluator()
.setMetricName("accuracy")
+
println("Accuracy: " + evaluator.evaluate(predictionAndLabels))
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
index 7089a4bc87..3ae0623c4c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala
@@ -30,6 +30,7 @@ object NaiveBayesExample {
.builder
.appName("NaiveBayesExample")
.getOrCreate()
+
// $example on$
// Load the data stored in LIBSVM format as a DataFrame.
val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
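The rest of NaiveBayesExample (not shown in this hunk) trains and evaluates on the loaded data roughly as follows; the split ratio and seed are assumptions.

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Split into training and test sets, fit the model, and score the held-out data.
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L)
val model = new NaiveBayes().fit(trainingData)
val predictions = model.transform(testData)

// Measure test accuracy with the multiclass evaluator.
val accuracy = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
  .evaluate(predictions)
println("Test set accuracy = " + accuracy)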
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala
index b16692b1fa..12f8663b9c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PipelineExample.scala
@@ -54,7 +54,7 @@ object PipelineExample {
.setOutputCol("features")
val lr = new LogisticRegression()
.setMaxIter(10)
- .setRegParam(0.01)
+ .setRegParam(0.001)
val pipeline = new Pipeline()
.setStages(Array(tokenizer, hashingTF, lr))
@@ -74,7 +74,7 @@ object PipelineExample {
val test = spark.createDataFrame(Seq(
(4L, "spark i j k"),
(5L, "l m n"),
- (6L, "mapreduce spark"),
+ (6L, "spark hadoop spark"),
(7L, "apache hadoop")
)).toDF("id", "text")
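After the changed test rows above, PipelineExample fits the pipeline and prints per-document predictions; a sketch of those remaining steps (Row and Vector are already imported in the example):

// Fit the three-stage pipeline on the labeled training documents.
val model = pipeline.fit(training)

// Score the unlabeled test documents; prediction 1.0 means the text was classified as containing "spark".
model.transform(test)
  .select("id", "text", "probability", "prediction")
  .collect()
  .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
    println(s"($id, $text) --> prob=$prob, prediction=$prediction")
  }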
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
index 2f7e217b8f..aedb9e7d3b 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala
@@ -28,16 +28,16 @@ object QuantileDiscretizerExample {
.builder
.appName("QuantileDiscretizerExample")
.getOrCreate()
- import spark.implicits._
// $example on$
val data = Array((0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2))
- var df = spark.createDataFrame(data).toDF("id", "hour")
+ val df = spark.createDataFrame(data).toDF("id", "hour")
// $example off$
// Output of QuantileDiscretizer for such small datasets can depend on the number of
// partitions. Here we force a single partition to ensure consistent results.
// Note this is not necessary for normal use cases
.repartition(1)
+
// $example on$
val discretizer = new QuantileDiscretizer()
.setInputCol("hour")
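The hunk is cut off mid-configuration; the discretizer setup continues roughly as below (the output column name and bucket count are assumptions).

// Complete the discretizer configuration and bin the "hour" column into quantile buckets.
val discretizer = new QuantileDiscretizer()
  .setInputCol("hour")
  .setOutputCol("result")
  .setNumBuckets(3)

// QuantileDiscretizer is an Estimator: fit() computes the split points, transform() applies them.
val result = discretizer.fit(df).transform(df)
result.show()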
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
index 9ea4920146..3498fa8a50 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
@@ -36,10 +36,12 @@ object RFormulaExample {
(8, "CA", 12, 0.0),
(9, "NZ", 15, 0.0)
)).toDF("id", "country", "hour", "clicked")
+
val formula = new RFormula()
.setFormula("clicked ~ country + hour")
.setFeaturesCol("features")
.setLabelCol("label")
+
val output = formula.fit(dataset).transform(dataset)
output.select("features", "label").show()
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
deleted file mode 100644
index 29f1f50960..0000000000
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.ml
-
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.feature.LabeledPoint
-import org.apache.spark.ml.linalg.{Vector, Vectors}
-import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.sql.{Row, SparkSession}
-
-/**
- * A simple example demonstrating ways to specify parameters for Estimators and Transformers.
- * Run with
- * {{{
- * bin/run-example ml.SimpleParamsExample
- * }}}
- */
-object SimpleParamsExample {
-
- def main(args: Array[String]) {
- val spark = SparkSession
- .builder
- .appName("SimpleParamsExample")
- .getOrCreate()
- import spark.implicits._
-
- // Prepare training data.
- // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of case classes
- // into DataFrames, where it uses the case class metadata to infer the schema.
- val training = spark.createDataFrame(Seq(
- LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
- LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
- LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
- LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5))))
-
- // Create a LogisticRegression instance. This instance is an Estimator.
- val lr = new LogisticRegression()
- // Print out the parameters, documentation, and any default values.
- println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
-
- // We may set parameters using setter methods.
- lr.setMaxIter(10)
- .setRegParam(0.01)
-
- // Learn a LogisticRegression model. This uses the parameters stored in lr.
- val model1 = lr.fit(training)
- // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
- // we can view the parameters it used during fit().
- // This prints the parameter (name: value) pairs, where names are unique IDs for this
- // LogisticRegression instance.
- println("Model 1 was fit using parameters: " + model1.parent.extractParamMap())
-
- // We may alternatively specify parameters using a ParamMap,
- // which supports several methods for specifying parameters.
- val paramMap = ParamMap(lr.maxIter -> 20)
- paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
- paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.5, 0.5)) // Specify multiple Params.
-
- // One can also combine ParamMaps.
- val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name
- val paramMapCombined = paramMap ++ paramMap2
-
- // Now learn a new model using the paramMapCombined parameters.
- // paramMapCombined overrides all parameters set earlier via lr.set* methods.
- val model2 = lr.fit(training.toDF(), paramMapCombined)
- println("Model 2 was fit using parameters: " + model2.parent.extractParamMap())
-
- // Prepare test data.
- val test = spark.createDataFrame(Seq(
- LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
- LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
- LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5))))
-
- // Make predictions on test data using the Transformer.transform() method.
- // LogisticRegressionModel.transform will only use the 'features' column.
- // Note that model2.transform() outputs a 'myProbability' column instead of the usual
- // 'probability' column since we renamed the lr.probabilityCol parameter previously.
- model2.transform(test)
- .select("features", "label", "myProbability", "prediction")
- .collect()
- .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
- println(s"($features, $label) -> prob=$prob, prediction=$prediction")
- }
-
- spark.stop()
- }
-}
-// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
deleted file mode 100644
index 0b2a058bb6..0000000000
--- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.ml
-
-import scala.beans.BeanInfo
-
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.classification.LogisticRegression
-import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
-import org.apache.spark.ml.linalg.Vector
-import org.apache.spark.sql.{Row, SparkSession}
-
-@BeanInfo
-case class LabeledDocument(id: Long, text: String, label: Double)
-
-@BeanInfo
-case class Document(id: Long, text: String)
-
-/**
- * A simple text classification pipeline that recognizes "spark" from input text. This is to show
- * how to create and configure an ML pipeline. Run with
- * {{{
- * bin/run-example ml.SimpleTextClassificationPipeline
- * }}}
- */
-object SimpleTextClassificationPipeline {
-
- def main(args: Array[String]) {
- val spark = SparkSession
- .builder
- .appName("SimpleTextClassificationPipeline")
- .getOrCreate()
- import spark.implicits._
-
- // Prepare training documents, which are labeled.
- val training = spark.createDataFrame(Seq(
- LabeledDocument(0L, "a b c d e spark", 1.0),
- LabeledDocument(1L, "b d", 0.0),
- LabeledDocument(2L, "spark f g h", 1.0),
- LabeledDocument(3L, "hadoop mapreduce", 0.0)))
-
- // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
- val tokenizer = new Tokenizer()
- .setInputCol("text")
- .setOutputCol("words")
- val hashingTF = new HashingTF()
- .setNumFeatures(1000)
- .setInputCol(tokenizer.getOutputCol)
- .setOutputCol("features")
- val lr = new LogisticRegression()
- .setMaxIter(10)
- .setRegParam(0.001)
- val pipeline = new Pipeline()
- .setStages(Array(tokenizer, hashingTF, lr))
-
- // Fit the pipeline to training documents.
- val model = pipeline.fit(training.toDF())
-
- // Prepare test documents, which are unlabeled.
- val test = spark.createDataFrame(Seq(
- Document(4L, "spark i j k"),
- Document(5L, "l m n"),
- Document(6L, "spark hadoop spark"),
- Document(7L, "apache hadoop")))
-
- // Make predictions on test documents.
- model.transform(test.toDF())
- .select("id", "text", "probability", "prediction")
- .collect()
- .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
- println(s"($id, $text) --> prob=$prob, prediction=$prediction")
- }
-
- spark.stop()
- }
-}
-// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
index fb1a43e962..a56de0856d 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -36,7 +36,7 @@ object StopWordsRemoverExample {
.setOutputCol("filtered")
val dataSet = spark.createDataFrame(Seq(
- (0, Seq("I", "saw", "the", "red", "baloon")),
+ (0, Seq("I", "saw", "the", "red", "balloon")),
(1, Seq("Mary", "had", "a", "little", "lamb"))
)).toDF("id", "raw")
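For reference, applying the remover configured above to the corrected dataset is a one-liner; the variable name `remover` is an assumption about the instance defined just before this hunk.

// Drop default English stop words from the "raw" column into "filtered",
// e.g. [I, saw, the, red, balloon] becomes [saw, red, balloon].
remover.transform(dataSet).show(false)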
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
index 33b5daec59..97f6fcce15 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TfIdfExample.scala
@@ -40,13 +40,16 @@ object TfIdfExample {
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val wordsData = tokenizer.transform(sentenceData)
+
val hashingTF = new HashingTF()
.setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
+
val featurizedData = hashingTF.transform(wordsData)
// alternatively, CountVectorizer can also be used to get term frequency vectors
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
+
val rescaledData = idfModel.transform(featurizedData)
rescaledData.select("features", "label").take(3).foreach(println)
// $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
index 1c70dc700b..90d0faaf47 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -45,6 +45,7 @@ object TokenizerExample {
val tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("words", "label").take(3).foreach(println)
+
val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("words", "label").take(3).foreach(println)
// $example off$
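Since this hunk only shows the added transform call, here is a sketch of a RegexTokenizer configured the way the synced example uses it; the exact pattern is an assumption.

// Tokenize by splitting on non-word characters instead of simple whitespace.
val regexTokenizer = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\W")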
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
index 9ac5623607..5c8bd19f20 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala
@@ -45,6 +45,7 @@ object Word2VecExample {
.setVectorSize(3)
.setMinCount(0)
val model = word2Vec.fit(documentDF)
+
val result = model.transform(documentDF)
result.select("result").take(3).foreach(println)
// $example off$