author     Bryan Cutler <cutlerb@gmail.com>    2016-07-14 09:12:46 +0100
committer  Sean Owen <sowen@cloudera.com>      2016-07-14 09:12:46 +0100
commit     e3f8a033679261aaee15bda0f970a1890411e743 (patch)
tree       fecc6121b1d5357c2214f710018de2a9ddea2786 /examples/src/main/java
parent     252d4f27f23b547777892bcea25a2cea62d8cbab (diff)
[SPARK-16403][EXAMPLES] Cleanup to remove unused imports, consistent style, minor fixes
## What changes were proposed in this pull request?

Cleanup of examples, mostly from PySpark-ML, to fix minor issues: unused imports, style consistency, pipeline_example being a duplicate, use of the future print function, and a spelling error.

* The "Pipeline Example" is duplicated by "Simple Text Classification Pipeline" in Scala, Python, and Java.
* "Estimator Transformer Param Example" is duplicated by "Simple Params Example" in Scala, Python, and Java.
* Synced random_forest_classifier_example.py with Scala by adding an IndexToString label converter (sketched below).
* Synced train_validation_split.py (in Scala, ModelSelectionViaTrainValidationExample) by adjusting the data split and adding a grid for the intercept.
* RegexTokenizer was doing nothing in tokenizer_example.py and JavaTokenizerExample.java; synced with the Scala version.

## How was this patch tested?

Local tests and running the modified examples.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #14081 from BryanCutler/examples-cleanup-SPARK-16403.
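A minimal Java sketch of the IndexToString label-converter pattern referenced above. The actual change is to the Python example; the class name, helper method, and column names ("label", "indexedLabel", "prediction", "predictedLabel") here are illustrative assumptions, not part of this commit.

```java
import org.apache.spark.ml.feature.IndexToString;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.StringIndexerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Hypothetical helper showing the pattern: index string labels, then map
// numeric predictions back to the original label strings with IndexToString.
public class LabelConverterSketch {
  public static IndexToString buildLabelConverter(Dataset<Row> data) {
    // Fit a StringIndexer so the original label values are available via labels().
    StringIndexerModel labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data);
    // Convert the numeric "prediction" column back to readable label strings.
    return new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labels());
  }
}
```

The converter would be appended as the last pipeline stage after the classifier, mirroring the Scala RandomForestClassifierExample that the Python example is synced with.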
Diffstat (limited to 'examples/src/main/java')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java                  |   4
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java              | 113
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java |  93
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java          |   2
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java                 |  19
5 files changed, 16 insertions, 215 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
index 9a43189c91..4ccd8f6ce2 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPipelineExample.java
@@ -60,7 +60,7 @@ public class JavaPipelineExample {
.setOutputCol("features");
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
- .setRegParam(0.01);
+ .setRegParam(0.001);
Pipeline pipeline = new Pipeline()
.setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
@@ -71,7 +71,7 @@ public class JavaPipelineExample {
Dataset<Row> test = spark.createDataFrame(Arrays.asList(
new JavaDocument(4L, "spark i j k"),
new JavaDocument(5L, "l m n"),
- new JavaDocument(6L, "mapreduce spark"),
+ new JavaDocument(6L, "spark hadoop spark"),
new JavaDocument(7L, "apache hadoop")
), JavaDocument.class);
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
deleted file mode 100644
index ca80d0d8bb..0000000000
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.ml;
-
-import java.util.List;
-
-import com.google.common.collect.Lists;
-
-import org.apache.spark.ml.classification.LogisticRegressionModel;
-import org.apache.spark.ml.param.ParamMap;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
-
-/**
- * A simple example demonstrating ways to specify parameters for Estimators and Transformers.
- * Run with
- * {{{
- * bin/run-example ml.JavaSimpleParamsExample
- * }}}
- */
-public class JavaSimpleParamsExample {
-
- public static void main(String[] args) {
- SparkSession spark = SparkSession
- .builder()
- .appName("JavaSimpleParamsExample")
- .getOrCreate();
-
- // Prepare training data.
- // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
- // into DataFrames, where it uses the bean metadata to infer the schema.
- List<LabeledPoint> localTraining = Lists.newArrayList(
- new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
- new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
- new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
- new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)));
- Dataset<Row> training =
- spark.createDataFrame(localTraining, LabeledPoint.class);
-
- // Create a LogisticRegression instance. This instance is an Estimator.
- LogisticRegression lr = new LogisticRegression();
- // Print out the parameters, documentation, and any default values.
- System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
-
- // We may set parameters using setter methods.
- lr.setMaxIter(10)
- .setRegParam(0.01);
-
- // Learn a LogisticRegression model. This uses the parameters stored in lr.
- LogisticRegressionModel model1 = lr.fit(training);
- // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
- // we can view the parameters it used during fit().
- // This prints the parameter (name: value) pairs, where names are unique IDs for this
- // LogisticRegression instance.
- System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());
-
- // We may alternatively specify parameters using a ParamMap.
- ParamMap paramMap = new ParamMap();
- paramMap.put(lr.maxIter().w(20)); // Specify 1 Param.
- paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter.
- double[] thresholds = {0.5, 0.5};
- paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params.
-
- // One can also combine ParamMaps.
- ParamMap paramMap2 = new ParamMap();
- paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name.
- ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
-
- // Now learn a new model using the paramMapCombined parameters.
- // paramMapCombined overrides all parameters set earlier via lr.set* methods.
- LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
- System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
-
- // Prepare test documents.
- List<LabeledPoint> localTest = Lists.newArrayList(
- new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
- new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
- new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)));
- Dataset<Row> test = spark.createDataFrame(localTest, LabeledPoint.class);
-
- // Make predictions on test documents using the Transformer.transform() method.
- // LogisticRegressionModel.transform will only use the 'features' column.
- // Note that model2.transform() outputs a 'myProbability' column instead of the usual
- // 'probability' column since we renamed the lr.probabilityCol parameter previously.
- Dataset<Row> results = model2.transform(test);
- Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
- for (Row r: rows.collectAsList()) {
- System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
- + ", prediction=" + r.get(3));
- }
-
- spark.stop();
- }
-}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
deleted file mode 100644
index 7c24c46d2e..0000000000
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.examples.ml;
-
-import java.util.List;
-
-import com.google.common.collect.Lists;
-
-import org.apache.spark.ml.Pipeline;
-import org.apache.spark.ml.PipelineModel;
-import org.apache.spark.ml.PipelineStage;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.feature.HashingTF;
-import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
-
-/**
- * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
- * bean classes {@link LabeledDocument} and {@link Document} defined in the Scala counterpart of
- * this example {@link SimpleTextClassificationPipeline}. Run with
- * <pre>
- * bin/run-example ml.JavaSimpleTextClassificationPipeline
- * </pre>
- */
-public class JavaSimpleTextClassificationPipeline {
-
- public static void main(String[] args) {
- SparkSession spark = SparkSession
- .builder()
- .appName("JavaSimpleTextClassificationPipeline")
- .getOrCreate();
-
- // Prepare training documents, which are labeled.
- List<LabeledDocument> localTraining = Lists.newArrayList(
- new LabeledDocument(0L, "a b c d e spark", 1.0),
- new LabeledDocument(1L, "b d", 0.0),
- new LabeledDocument(2L, "spark f g h", 1.0),
- new LabeledDocument(3L, "hadoop mapreduce", 0.0));
- Dataset<Row> training =
- spark.createDataFrame(localTraining, LabeledDocument.class);
-
- // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
- Tokenizer tokenizer = new Tokenizer()
- .setInputCol("text")
- .setOutputCol("words");
- HashingTF hashingTF = new HashingTF()
- .setNumFeatures(1000)
- .setInputCol(tokenizer.getOutputCol())
- .setOutputCol("features");
- LogisticRegression lr = new LogisticRegression()
- .setMaxIter(10)
- .setRegParam(0.001);
- Pipeline pipeline = new Pipeline()
- .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
-
- // Fit the pipeline to training documents.
- PipelineModel model = pipeline.fit(training);
-
- // Prepare test documents, which are unlabeled.
- List<Document> localTest = Lists.newArrayList(
- new Document(4L, "spark i j k"),
- new Document(5L, "l m n"),
- new Document(6L, "spark hadoop spark"),
- new Document(7L, "apache hadoop"));
- Dataset<Row> test = spark.createDataFrame(localTest, Document.class);
-
- // Make predictions on test documents.
- Dataset<Row> predictions = model.transform(test);
- for (Row r: predictions.select("id", "text", "probability", "prediction").collectAsList()) {
- System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
- + ", prediction=" + r.get(3));
- }
-
- spark.stop();
- }
-}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
index def5994429..278cce0842 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -47,7 +47,7 @@ public class JavaStopWordsRemoverExample {
.setOutputCol("filtered");
List<Row> data = Arrays.asList(
- RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+ RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
);
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
index 1cc16bb60d..a206cef4c2 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -57,17 +57,24 @@ public class JavaTokenizerExample {
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
- Dataset<Row> wordsDataFrame = tokenizer.transform(sentenceDataFrame);
- for (Row r : wordsDataFrame.select("words", "label").takeAsList(3)) {
+ RegexTokenizer regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+
+ Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
+ for (Row r : tokenized.select("words", "label").takeAsList(3)) {
java.util.List<String> words = r.getList(0);
for (String word : words) System.out.print(word + " ");
System.out.println();
}
- RegexTokenizer regexTokenizer = new RegexTokenizer()
- .setInputCol("sentence")
- .setOutputCol("words")
- .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+ Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
+ for (Row r : regexTokenized.select("words", "label").takeAsList(3)) {
+ java.util.List<String> words = r.getList(0);
+ for (String word : words) System.out.print(word + " ");
+ System.out.println();
+ }
// $example off$
spark.stop();
}