[SPARK-1874][MLLIB] Clean up MLlib sample data

1. Added synthetic datasets for `MovieLensALS`, `LinearRegression`, and `BinaryClassification`. 2. Embedded usage instructions in the help messages of those example apps. Per discussion with Matei on the JIRA page, the new example data is under `data/mllib`. Author: Xiangrui Meng <meng@databricks.com>. Closes #833 from mengxr/mllib-sample-data and squashes the following commits: 59f0a18 [Xiangrui Meng] add sample binary classification data; 3c2f92f [Xiangrui Meng] add linear regression data; 050f1ca [Xiangrui Meng] add a sample dataset for MovieLensALS example.
author: Xiangrui Meng <meng@databricks.com> 2014-05-19 21:29:33 -0700
committer: Tathagata Das <tathagata.das1565@gmail.com> 2014-05-19 21:29:33 -0700
commit: bcb9dce6f444a977c714117811bce0c54b417650 (patch)
tree: d12d9ba87dcf5d85edc38f082adbb6fae2a19052 /examples
parent: b0ce22e071da4cc62ec5e29abf7b1299b8e4a6b0 (diff)
download: spark-bcb9dce6f444a977c714117811bce0c54b417650.tar.gz
spark-bcb9dce6f444a977c714117811bce0c54b417650.tar.bz2
spark-bcb9dce6f444a977c714117811bce0c54b417650.zip
3 files changed, 36 insertions, 2 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
index 4001908c98..56b02b65d8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
@@ -29,8 +29,9 @@ import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater}
 /**
  * An example app for binary classification. Run with
  * {{{
- * ./bin/run-example org.apache.spark.examples.mllib.BinaryClassification
+ * bin/run-example org.apache.spark.examples.mllib.BinaryClassification
  * }}}
+ * A synthetic dataset is located at `data/mllib/sample_binary_classification_data.txt`.
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
 object BinaryClassification {
@@ -81,6 +82,15 @@ object BinaryClassification {
         .required()
         .text("input paths to labeled examples in LIBSVM format")
         .action((x, c) => c.copy(input = x))
+      note(
+        """
+          |For example, the following command runs this app on a synthetic dataset:
+          |
+          | bin/spark-submit --class org.apache.spark.examples.mllib.BinaryClassification \
+          |  examples/target/scala-*/spark-examples-*.jar \
+          |  --algorithm LR --regType L2 --regParam 1.0 \
+          |  data/mllib/sample_binary_classification_data.txt
+        """.stripMargin)
     }
 
     parser.parse(args, defaultParams).map { params =>
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
index 658d370f86..4811bb70e4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
@@ -28,8 +28,9 @@ import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1U
 /**
  * An example app for linear regression. Run with
  * {{{
- * ./bin/run-example org.apache.spark.examples.mllib.LinearRegression
+ * bin/run-example org.apache.spark.examples.mllib.LinearRegression
  * }}}
+ * A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt`.
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
 object LinearRegression extends App {
@@ -68,6 +69,14 @@ object LinearRegression extends App {
       .required()
       .text("input paths to labeled examples in LIBSVM format")
       .action((x, c) => c.copy(input = x))
+    note(
+      """
+        |For example, the following command runs this app on a synthetic dataset:
+        |
+        | bin/spark-submit --class org.apache.spark.examples.mllib.LinearRegression \
+        |  examples/target/scala-*/spark-examples-*.jar \
+        |  data/mllib/sample_linear_regression_data.txt
+      """.stripMargin)
   }
 
   parser.parse(args, defaultParams).map { params =>
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
index 0e4447e0de..6eb41e7ba3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
@@ -29,6 +29,12 @@ import org.apache.spark.serializer.{KryoSerializer, KryoRegistrator}
 
 /**
  * An example app for ALS on MovieLens data (http://grouplens.org/datasets/movielens/).
+ * Run with
+ * {{{
+ * bin/run-example org.apache.spark.examples.mllib.MovieLensALS
+ * }}}
+ * A synthetic dataset in MovieLens format can be found at `data/mllib/sample_movielens_data.txt`.
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
 object MovieLensALS {
 
@@ -70,6 +76,15 @@ object MovieLensALS {
         .required()
         .text("input paths to a MovieLens dataset of ratings")
         .action((x, c) => c.copy(input = x))
+      note(
+        """
+          |For example, the following command runs this app on a synthetic dataset:
+          |
+          | bin/spark-submit --class org.apache.spark.examples.mllib.MovieLensALS \
+          |  examples/target/scala-*/spark-examples-*.jar \
+          |  --rank 5 --numIterations 20 --lambda 1.0 --kryo \
+          |  data/mllib/sample_movielens_data.txt
+        """.stripMargin)
     }
 
     parser.parse(args, defaultParams).map { params =>
author	Xiangrui Meng <meng@databricks.com>	2014-05-19 21:29:33 -0700
committer	Tathagata Das <tathagata.das1565@gmail.com>	2014-05-19 21:29:33 -0700
commit	bcb9dce6f444a977c714117811bce0c54b417650 (patch)
tree	d12d9ba87dcf5d85edc38f082adbb6fae2a19052 /examples
parent	b0ce22e071da4cc62ec5e29abf7b1299b8e4a6b0 (diff)
download	spark-bcb9dce6f444a977c714117811bce0c54b417650.tar.gz spark-bcb9dce6f444a977c714117811bce0c54b417650.tar.bz2 spark-bcb9dce6f444a977c714117811bce0c54b417650.zip