From bcb9dce6f444a977c714117811bce0c54b417650 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 19 May 2014 21:29:33 -0700 Subject: [SPARK-1874][MLLIB] Clean up MLlib sample data 1. Added synthetic datasets for `MovieLensALS`, `LinearRegression`, `BinaryClassification`. 2. Embedded instructions in the help message of those example apps. Per discussion with Matei on the JIRA page, new example data is under `data/mllib`. Author: Xiangrui Meng Closes #833 from mengxr/mllib-sample-data and squashes the following commits: 59f0a18 [Xiangrui Meng] add sample binary classification data 3c2f92f [Xiangrui Meng] add linear regression data 050f1ca [Xiangrui Meng] add a sample dataset for MovieLensALS example --- .../spark/examples/mllib/BinaryClassification.scala | 12 +++++++++++- .../apache/spark/examples/mllib/LinearRegression.scala | 11 ++++++++++- .../org/apache/spark/examples/mllib/MovieLensALS.scala | 15 +++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) (limited to 'examples/src') diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala index 4001908c98..56b02b65d8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala @@ -29,8 +29,9 @@ import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater} /** * An example app for binary classification. Run with * {{{ - * ./bin/run-example org.apache.spark.examples.mllib.BinaryClassification + * bin/run-example org.apache.spark.examples.mllib.BinaryClassification * }}} + * A synthetic dataset is located at `data/mllib/sample_binary_classification_data.txt`. * If you use it as a template to create your own app, please use `spark-submit` to submit your app. */ object BinaryClassification { @@ -81,6 +82,15 @@ object BinaryClassification { .required() .text("input paths to labeled examples in LIBSVM format") .action((x, c) => c.copy(input = x)) + note( + """ + |For example, the following command runs this app on a synthetic dataset: + | + | bin/spark-submit --class org.apache.spark.examples.mllib.BinaryClassification \ + | examples/target/scala-*/spark-examples-*.jar \ + | --algorithm LR --regType L2 --regParam 1.0 \ + | data/mllib/sample_binary_classification_data.txt + """.stripMargin) } parser.parse(args, defaultParams).map { params => diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala index 658d370f86..4811bb70e4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala @@ -28,8 +28,9 @@ import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1U /** * An example app for linear regression. Run with * {{{ - * ./bin/run-example org.apache.spark.examples.mllib.LinearRegression + * bin/run-example org.apache.spark.examples.mllib.LinearRegression * }}} + * A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt`. * If you use it as a template to create your own app, please use `spark-submit` to submit your app. */ object LinearRegression extends App { @@ -68,6 +69,14 @@ object LinearRegression extends App { .required() .text("input paths to labeled examples in LIBSVM format") .action((x, c) => c.copy(input = x)) + note( + """ + |For example, the following command runs this app on a synthetic dataset: + | + | bin/spark-submit --class org.apache.spark.examples.mllib.LinearRegression \ + | examples/target/scala-*/spark-examples-*.jar \ + | data/mllib/sample_linear_regression_data.txt + """.stripMargin) } parser.parse(args, defaultParams).map { params => diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala index 0e4447e0de..6eb41e7ba3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala @@ -29,6 +29,12 @@ import org.apache.spark.serializer.{KryoSerializer, KryoRegistrator} /** * An example app for ALS on MovieLens data (http://grouplens.org/datasets/movielens/). + * Run with + * {{{ + * bin/run-example org.apache.spark.examples.mllib.MovieLensALS + * }}} + * A synthetic dataset in MovieLens format can be found at `data/mllib/sample_movielens_data.txt`. + * If you use it as a template to create your own app, please use `spark-submit` to submit your app. */ object MovieLensALS { @@ -70,6 +76,15 @@ object MovieLensALS { .required() .text("input paths to a MovieLens dataset of ratings") .action((x, c) => c.copy(input = x)) + note( + """ + |For example, the following command runs this app on a synthetic dataset: + | + | bin/spark-submit --class org.apache.spark.examples.mllib.MovieLensALS \ + | examples/target/scala-*/spark-examples-*.jar \ + | --rank 5 --numIterations 20 --lambda 1.0 --kryo \ + | data/mllib/sample_movielens_data.txt + """.stripMargin) } parser.parse(args, defaultParams).map { params => -- cgit v1.2.3