aboutsummaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
authorXin Ren <iamshrek@126.com>2016-10-26 13:33:23 -0700
committerJoseph K. Bradley <joseph@databricks.com>2016-10-26 13:33:23 -0700
commitdcdda19785a272969fb1e3ec18382403aaad6c91 (patch)
tree7847a7849a8cdd11c8818e60f1d26e28dc09b3ae /examples
parentfb0a8a8dd7e8985676a846684b956e2d988875c6 (diff)
downloadspark-dcdda19785a272969fb1e3ec18382403aaad6c91.tar.gz
spark-dcdda19785a272969fb1e3ec18382403aaad6c91.tar.bz2
spark-dcdda19785a272969fb1e3ec18382403aaad6c91.zip
[SPARK-14300][DOCS][MLLIB] Scala MLlib examples code merge and clean up
## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-14300 Duplicated code found in scala/examples/mllib, below all deleted in this PR: - DenseGaussianMixture.scala - StreamingLinearRegression.scala ## delete reasons: #### delete: mllib/DenseGaussianMixture.scala - duplicate of mllib/GaussianMixtureExample #### delete: mllib/StreamingLinearRegression.scala - duplicate of mllib/StreamingLinearRegressionExample When merging and cleaning up that code, be sure not to disturb the previous example on/off blocks. ## How was this patch tested? Tested with `SKIP_API=1 jekyll` manually to make sure that it works well. Author: Xin Ren <iamshrek@126.com> Closes #12195 from keypointt/SPARK-14300.
Diffstat (limited to 'examples')
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala75
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala73
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala19
3 files changed, 19 insertions, 148 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
deleted file mode 100644
index 90b817b23e..0000000000
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.mllib
-
-import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.mllib.clustering.GaussianMixture
-import org.apache.spark.mllib.linalg.Vectors
-
-/**
- * An example Gaussian Mixture Model EM app. Run with
- * {{{
- * ./bin/run-example mllib.DenseGaussianMixture <input> <k> <convergenceTol>
- * }}}
- * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
- */
-object DenseGaussianMixture {
- def main(args: Array[String]): Unit = {
- if (args.length < 3) {
- println("usage: DenseGmmEM <input file> <k> <convergenceTol> [maxIterations]")
- } else {
- val maxIterations = if (args.length > 3) args(3).toInt else 100
- run(args(0), args(1).toInt, args(2).toDouble, maxIterations)
- }
- }
-
- private def run(inputFile: String, k: Int, convergenceTol: Double, maxIterations: Int) {
- val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example")
- val ctx = new SparkContext(conf)
-
- val data = ctx.textFile(inputFile).map { line =>
- Vectors.dense(line.trim.split(' ').map(_.toDouble))
- }.cache()
-
- val clusters = new GaussianMixture()
- .setK(k)
- .setConvergenceTol(convergenceTol)
- .setMaxIterations(maxIterations)
- .run(data)
-
- for (i <- 0 until clusters.k) {
- println("weight=%f\nmu=%s\nsigma=\n%s\n" format
- (clusters.weights(i), clusters.gaussians(i).mu, clusters.gaussians(i).sigma))
- }
-
- println("The membership value of each vector to all mixture components (first <= 100):")
- val membership = clusters.predictSoft(data)
- membership.take(100).foreach { x =>
- print(" " + x.mkString(","))
- }
- println()
- println("Cluster labels (first <= 100):")
- val clusterLabels = clusters.predict(data)
- clusterLabels.take(100).foreach { x =>
- print(" " + x)
- }
- println()
- }
-}
-// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
deleted file mode 100644
index e5592966f1..0000000000
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.mllib
-
-import org.apache.spark.SparkConf
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
-import org.apache.spark.streaming.{Seconds, StreamingContext}
-
-/**
- * Train a linear regression model on one stream of data and make predictions
- * on another stream, where the data streams arrive as text files
- * into two different directories.
- *
- * The rows of the text files must be labeled data points in the form
- * `(y,[x1,x2,x3,...,xn])`
- * Where n is the number of features. n must be the same for train and test.
- *
- * Usage: StreamingLinearRegression <trainingDir> <testDir> <batchDuration> <numFeatures>
- *
- * To run on your local machine using the two directories `trainingDir` and `testDir`,
- * with updates every 5 seconds, and 2 features per data point, call:
- * $ bin/run-example mllib.StreamingLinearRegression trainingDir testDir 5 2
- *
- * As you add text files to `trainingDir` the model will continuously update.
- * Anytime you add text files to `testDir`, you'll see predictions from the current model.
- *
- */
-object StreamingLinearRegression {
-
- def main(args: Array[String]) {
-
- if (args.length != 4) {
- System.err.println(
- "Usage: StreamingLinearRegression <trainingDir> <testDir> <batchDuration> <numFeatures>")
- System.exit(1)
- }
-
- val conf = new SparkConf().setMaster("local").setAppName("StreamingLinearRegression")
- val ssc = new StreamingContext(conf, Seconds(args(2).toLong))
-
- val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
- val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)
-
- val model = new StreamingLinearRegressionWithSGD()
- .setInitialWeights(Vectors.zeros(args(3).toInt))
-
- model.trainOn(trainingData)
- model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
-
- ssc.start()
- ssc.awaitTermination()
-
- }
-
-}
-// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala
index 0a1cd2d62d..2ba1a62e45 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala
@@ -26,6 +26,25 @@ import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
// $example off$
import org.apache.spark.streaming._
+/**
+ * Train a linear regression model on one stream of data and make predictions
+ * on another stream, where the data streams arrive as text files
+ * into two different directories.
+ *
+ * The rows of the text files must be labeled data points in the form
+ * `(y,[x1,x2,x3,...,xn])`
+ * Where n is the number of features. n must be the same for train and test.
+ *
+ * Usage: StreamingLinearRegressionExample <trainingDir> <testDir>
+ *
+ * To run on your local machine using the two directories `trainingDir` and `testDir`,
+ * with updates every 5 seconds, and 2 features per data point, call:
+ * $ bin/run-example mllib.StreamingLinearRegressionExample trainingDir testDir
+ *
+ * As you add text files to `trainingDir` the model will continuously update.
+ * Anytime you add text files to `testDir`, you'll see predictions from the current model.
+ *
+ */
object StreamingLinearRegressionExample {
def main(args: Array[String]): Unit = {