diff options
author | Xiangrui Meng <meng@databricks.com> | 2014-05-19 21:29:33 -0700 |
---|---|---|
committer | Tathagata Das <tathagata.das1565@gmail.com> | 2014-05-19 21:29:33 -0700 |
commit | bcb9dce6f444a977c714117811bce0c54b417650 (patch) | |
tree | d12d9ba87dcf5d85edc38f082adbb6fae2a19052 /examples | |
parent | b0ce22e071da4cc62ec5e29abf7b1299b8e4a6b0 (diff) | |
download | spark-bcb9dce6f444a977c714117811bce0c54b417650.tar.gz spark-bcb9dce6f444a977c714117811bce0c54b417650.tar.bz2 spark-bcb9dce6f444a977c714117811bce0c54b417650.zip |
[SPARK-1874][MLLIB] Clean up MLlib sample data
1. Added synthetic datasets for `MovieLensALS`, `LinearRegression`, `BinaryClassification`.
2. Embedded instructions in the help message of those example apps.
Per discussion with Matei on the JIRA page, new example data is under `data/mllib`.
Author: Xiangrui Meng <meng@databricks.com>
Closes #833 from mengxr/mllib-sample-data and squashes the following commits:
59f0a18 [Xiangrui Meng] add sample binary classification data
3c2f92f [Xiangrui Meng] add linear regression data
050f1ca [Xiangrui Meng] add a sample dataset for MovieLensALS example
Diffstat (limited to 'examples')
3 files changed, 36 insertions, 2 deletions
```diff
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
index 4001908c98..56b02b65d8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
@@ -29,8 +29,9 @@ import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater}
 /**
  * An example app for binary classification. Run with
  * {{{
- * ./bin/run-example org.apache.spark.examples.mllib.BinaryClassification
+ * bin/run-example org.apache.spark.examples.mllib.BinaryClassification
  * }}}
+ * A synthetic dataset is located at `data/mllib/sample_binary_classification_data.txt`.
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
 object BinaryClassification {
@@ -81,6 +82,15 @@ object BinaryClassification {
         .required()
         .text("input paths to labeled examples in LIBSVM format")
         .action((x, c) => c.copy(input = x))
+      note(
+        """
+          |For example, the following command runs this app on a synthetic dataset:
+          |
+          | bin/spark-submit --class org.apache.spark.examples.mllib.BinaryClassification \
+          |  examples/target/scala-*/spark-examples-*.jar \
+          |  --algorithm LR --regType L2 --regParam 1.0 \
+          |  data/mllib/sample_binary_classification_data.txt
+        """.stripMargin)
     }
 
     parser.parse(args, defaultParams).map { params =>
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
index 658d370f86..4811bb70e4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
@@ -28,8 +28,9 @@ import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1U
 /**
  * An example app for linear regression. Run with
  * {{{
- * ./bin/run-example org.apache.spark.examples.mllib.LinearRegression
+ * bin/run-example org.apache.spark.examples.mllib.LinearRegression
  * }}}
+ * A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt`.
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
 object LinearRegression extends App {
@@ -68,6 +69,14 @@ object LinearRegression extends App {
       .required()
       .text("input paths to labeled examples in LIBSVM format")
       .action((x, c) => c.copy(input = x))
+    note(
+      """
+        |For example, the following command runs this app on a synthetic dataset:
+        |
+        | bin/spark-submit --class org.apache.spark.examples.mllib.LinearRegression \
+        |  examples/target/scala-*/spark-examples-*.jar \
+        |  data/mllib/sample_linear_regression_data.txt
+      """.stripMargin)
   }
 
   parser.parse(args, defaultParams).map { params =>
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
index 0e4447e0de..6eb41e7ba3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
@@ -29,6 +29,12 @@ import org.apache.spark.serializer.{KryoSerializer, KryoRegistrator}
 
 /**
  * An example app for ALS on MovieLens data (http://grouplens.org/datasets/movielens/).
+ * Run with
+ * {{{
+ * bin/run-example org.apache.spark.examples.mllib.MovieLensALS
+ * }}}
+ * A synthetic dataset in MovieLens format can be found at `data/mllib/sample_movielens_data.txt`.
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
 object MovieLensALS {
 
@@ -70,6 +76,15 @@ object MovieLensALS {
         .required()
         .text("input paths to a MovieLens dataset of ratings")
         .action((x, c) => c.copy(input = x))
+      note(
+        """
+          |For example, the following command runs this app on a synthetic dataset:
+          |
+          | bin/spark-submit --class org.apache.spark.examples.mllib.MovieLensALS \
+          |  examples/target/scala-*/spark-examples-*.jar \
+          |  --rank 5 --numIterations 20 --lambda 1.0 --kryo \
+          |  data/mllib/sample_movielens_data.txt
+        """.stripMargin)
     }
 
     parser.parse(args, defaultParams).map { params =>
```