From dcdda19785a272969fb1e3ec18382403aaad6c91 Mon Sep 17 00:00:00 2001 From: Xin Ren Date: Wed, 26 Oct 2016 13:33:23 -0700 Subject: [SPARK-14300][DOCS][MLLIB] Scala MLlib examples code merge and clean up ## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-14300 Duplicated code found in scala/examples/mllib, below all deleted in this PR: - DenseGaussianMixture.scala - StreamingLinearRegression.scala ## delete reasons: #### delete: mllib/DenseGaussianMixture.scala - duplicate of mllib/GaussianMixtureExample #### delete: mllib/StreamingLinearRegression.scala - duplicate of mllib/StreamingLinearRegressionExample When merging and cleaning that code, be sure not to disturb the previous example on and off blocks. ## How was this patch tested? Test with `SKIP_API=1 jekyll` manually to make sure that works well. Author: Xin Ren Closes #12195 from keypointt/SPARK-14300. --- .../examples/mllib/DenseGaussianMixture.scala | 75 ---------------------- .../examples/mllib/StreamingLinearRegression.scala | 73 --------------------- .../mllib/StreamingLinearRegressionExample.scala | 19 ++++++ 3 files changed, 19 insertions(+), 148 deletions(-) delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala delete mode 100644 examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala (limited to 'examples') diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala deleted file mode 100644 index 90b817b23e..0000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.mllib.clustering.GaussianMixture -import org.apache.spark.mllib.linalg.Vectors - -/** - * An example Gaussian Mixture Model EM app. Run with - * {{{ - * ./bin/run-example mllib.DenseGaussianMixture - * }}} - * If you use it as a template to create your own app, please use `spark-submit` to submit your app. 
- */ -object DenseGaussianMixture { - def main(args: Array[String]): Unit = { - if (args.length < 3) { - println("usage: DenseGmmEM [maxIterations]") - } else { - val maxIterations = if (args.length > 3) args(3).toInt else 100 - run(args(0), args(1).toInt, args(2).toDouble, maxIterations) - } - } - - private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) { - val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example") - val ctx = new SparkContext(conf) - - val data = ctx.textFile(inputFile).map { line => - Vectors.dense(line.trim.split(' ').map(_.toDouble)) - }.cache() - - val clusters = new GaussianMixture() - .setK(k) - .setConvergenceTol(convergenceTol) - .setMaxIterations(maxIterations) - .run(data) - - for (i <- 0 until clusters.k) { - println("weight=%f\nmu=%s\nsigma=\n%s\n" format - (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma)) - } - - println("The membership value of each vector to all mixture components (first <= 100):") - val membership = clusters.predictSoft(data) - membership.take(100).foreach { x => - print(" " + x.mkString(",")) - } - println() - println("Cluster labels (first <= 100):") - val clusterLabels = clusters.predict(data) - clusterLabels.take(100).foreach { x => - print(" " + x) - } - println() - } -} -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala deleted file mode 100644 index e5592966f1..0000000000 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// scalastyle:off println -package org.apache.spark.examples.mllib - -import org.apache.spark.SparkConf -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD} -import org.apache.spark.streaming.{Seconds, StreamingContext} - -/** - * Train a linear regression model on one stream of data and make predictions - * on another stream, where the data streams arrive as text files - * into two different directories. - * - * The rows of the text files must be labeled data points in the form - * `(y,[x1,x2,x3,...,xn])` - * Where n is the number of features. n must be the same for train and test. - * - * Usage: StreamingLinearRegression - * - * To run on your local machine using the two directories `trainingDir` and `testDir`, - * with updates every 5 seconds, and 2 features per data point, call: - * $ bin/run-example mllib.StreamingLinearRegression trainingDir testDir 5 2 - * - * As you add text files to `trainingDir` the model will continuously update. - * Anytime you add text files to `testDir`, you'll see predictions from the current model. 
- * - */ -object StreamingLinearRegression { - - def main(args: Array[String]) { - - if (args.length != 4) { - System.err.println( - "Usage: StreamingLinearRegression ") - System.exit(1) - } - - val conf = new SparkConf().setMaster("local").setAppName("StreamingLinearRegression") - val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) - - val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse) - val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) - - val model = new StreamingLinearRegressionWithSGD() - .setInitialWeights(Vectors.zeros(args(3).toInt)) - - model.trainOn(trainingData) - model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() - - ssc.start() - ssc.awaitTermination() - - } - -} -// scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala index 0a1cd2d62d..2ba1a62e45 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala @@ -26,6 +26,25 @@ import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ +/** + * Train a linear regression model on one stream of data and make predictions + * on another stream, where the data streams arrive as text files + * into two different directories. + * + * The rows of the text files must be labeled data points in the form + * `(y,[x1,x2,x3,...,xn])` + * Where n is the number of features. n must be the same for train and test. 
+ * + * Usage: StreamingLinearRegressionExample + * + * To run on your local machine using the two directories `trainingDir` and `testDir`, + * with updates every 5 seconds, and 2 features per data point, call: + * $ bin/run-example mllib.StreamingLinearRegressionExample trainingDir testDir + * + * As you add text files to `trainingDir` the model will continuously update. + * Anytime you add text files to `testDir`, you'll see predictions from the current model. + * + */ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { -- cgit v1.2.3