From bcb9dce6f444a977c714117811bce0c54b417650 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng <meng@databricks.com>
Date: Mon, 19 May 2014 21:29:33 -0700
Subject: [SPARK-1874][MLLIB] Clean up MLlib sample data

1. Added synthetic datasets for `MovieLensALS`, `LinearRegression`, and `BinaryClassification`.
2. Embedded usage instructions in the help messages of those example apps.

Per discussion with Matei on the JIRA page, new example data is under `data/mllib`.

Author: Xiangrui Meng <meng@databricks.com>

Closes #833 from mengxr/mllib-sample-data and squashes the following commits:

59f0a18 [Xiangrui Meng] add sample binary classification data
3c2f92f [Xiangrui Meng] add linear regression data
050f1ca [Xiangrui Meng] add a sample dataset for MovieLensALS example
---
 .../spark/examples/mllib/BinaryClassification.scala       | 12 +++++++++++-
 .../apache/spark/examples/mllib/LinearRegression.scala    | 11 ++++++++++-
 .../org/apache/spark/examples/mllib/MovieLensALS.scala    | 15 +++++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'examples/src')

diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
index 4001908c98..56b02b65d8 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassification.scala
@@ -29,8 +29,9 @@ import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater}
 /**
  * An example app for binary classification. Run with
  * {{{
- * ./bin/run-example org.apache.spark.examples.mllib.BinaryClassification
+ * bin/run-example org.apache.spark.examples.mllib.BinaryClassification
  * }}}
+ * A synthetic dataset is located at `data/mllib/sample_binary_classification_data.txt`.
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
 object BinaryClassification {
@@ -81,6 +82,15 @@ object BinaryClassification {
         .required()
         .text("input paths to labeled examples in LIBSVM format")
         .action((x, c) => c.copy(input = x))
+      note(
+        """
+          |For example, the following command runs this app on a synthetic dataset:
+          |
+          | bin/spark-submit --class org.apache.spark.examples.mllib.BinaryClassification \
+          |  examples/target/scala-*/spark-examples-*.jar \
+          |  --algorithm LR --regType L2 --regParam 1.0 \
+          |  data/mllib/sample_binary_classification_data.txt
+        """.stripMargin)
     }
 
     parser.parse(args, defaultParams).map { params =>
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
index 658d370f86..4811bb70e4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegression.scala
@@ -28,8 +28,9 @@ import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1U
 /**
  * An example app for linear regression. Run with
  * {{{
- * ./bin/run-example org.apache.spark.examples.mllib.LinearRegression
+ * bin/run-example org.apache.spark.examples.mllib.LinearRegression
  * }}}
+ * A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt`.
  * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
  */
 object LinearRegression extends App {
@@ -68,6 +69,14 @@ object LinearRegression extends App {
       .required()
       .text("input paths to labeled examples in LIBSVM format")
       .action((x, c) => c.copy(input = x))
+    note(
+      """
+        |For example, the following command runs this app on a synthetic dataset:
+        |
+        | bin/spark-submit --class org.apache.spark.examples.mllib.LinearRegression \
+        |  examples/target/scala-*/spark-examples-*.jar \
+        |  data/mllib/sample_linear_regression_data.txt
+      """.stripMargin)
   }
 
   parser.parse(args, defaultParams).map { params =>
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
index 0e4447e0de..6eb41e7ba3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
@@ -29,6 +29,12 @@ import org.apache.spark.serializer.{KryoSerializer, KryoRegistrator}
 
 /**
  * An example app for ALS on MovieLens data (http://grouplens.org/datasets/movielens/).
+ * Run with
+ * {{{
+ * bin/run-example org.apache.spark.examples.mllib.MovieLensALS
+ * }}}
+ * A synthetic dataset in MovieLens format can be found at `data/mllib/sample_movielens_data.txt`.
+ * If you use this example as a template for your own app, please use `spark-submit` to submit it.
  */
 object MovieLensALS {
 
@@ -70,6 +76,15 @@ object MovieLensALS {
         .required()
         .text("input paths to a MovieLens dataset of ratings")
         .action((x, c) => c.copy(input = x))
+      note(
+        """
+          |For example, the following command runs this app on a synthetic dataset:
+          |
+          | bin/spark-submit --class org.apache.spark.examples.mllib.MovieLensALS \
+          |  examples/target/scala-*/spark-examples-*.jar \
+          |  --rank 5 --numIterations 20 --lambda 1.0 --kryo \
+          |  data/mllib/sample_movielens_data.txt
+        """.stripMargin)
     }
 
     parser.parse(args, defaultParams).map { params =>
-- 
cgit v1.2.3