aboutsummaryrefslogtreecommitdiff
path: root/docs/mllib-collaborative-filtering.md
diff options
context:
space:
mode:
author Rishabh Bhardwaj <rbnext29@gmail.com> 2015-11-09 14:27:36 -0800
committer Xiangrui Meng <meng@databricks.com> 2015-11-09 14:27:36 -0800
commit b7720fa45525cff6e812fa448d0841cb41f6c8a5 (patch)
tree 934518fb170c41dcd2d6225f75dd0bb001476448 /docs/mllib-collaborative-filtering.md
parent 51d41e4b1a3a25a3fde3a4345afcfe4766023d23 (diff)
download spark-b7720fa45525cff6e812fa448d0841cb41f6c8a5.tar.gz
spark-b7720fa45525cff6e812fa448d0841cb41f6c8a5.tar.bz2
spark-b7720fa45525cff6e812fa448d0841cb41f6c8a5.zip
[SPARK-11548][DOCS] Replaced example code in mllib-collaborative-filtering.md using include_example
Kindly review the changes. Author: Rishabh Bhardwaj <rbnext29@gmail.com> Closes #9519 from rishabhbhardwaj/SPARK-11337.
Diffstat (limited to 'docs/mllib-collaborative-filtering.md')
-rw-r--r-- docs/mllib-collaborative-filtering.md 138
1 files changed, 3 insertions, 135 deletions
diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md
index 1ad52123c7..7cd1b894e7 100644
--- a/docs/mllib-collaborative-filtering.md
+++ b/docs/mllib-collaborative-filtering.md
@@ -66,43 +66,7 @@ recommendation model by measuring the Mean Squared Error of rating prediction.
Refer to the [`ALS` Scala docs](api/scala/index.html#org.apache.spark.mllib.recommendation.ALS) for details on the API.
-{% highlight scala %}
-import org.apache.spark.mllib.recommendation.ALS
-import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
-import org.apache.spark.mllib.recommendation.Rating
-
-// Load and parse the data
-val data = sc.textFile("data/mllib/als/test.data")
-val ratings = data.map(_.split(',') match { case Array(user, item, rate) =>
- Rating(user.toInt, item.toInt, rate.toDouble)
- })
-
-// Build the recommendation model using ALS
-val rank = 10
-val numIterations = 10
-val model = ALS.train(ratings, rank, numIterations, 0.01)
-
-// Evaluate the model on rating data
-val usersProducts = ratings.map { case Rating(user, product, rate) =>
- (user, product)
-}
-val predictions =
- model.predict(usersProducts).map { case Rating(user, product, rate) =>
- ((user, product), rate)
- }
-val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
- ((user, product), rate)
-}.join(predictions)
-val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
- val err = (r1 - r2)
- err * err
-}.mean()
-println("Mean Squared Error = " + MSE)
-
-// Save and load model
-model.save(sc, "myModelPath")
-val sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/RecommendationExample.scala %}
If the rating matrix is derived from another source of information (e.g., it is inferred from
other signals), you can use the `trainImplicit` method to get better results.
@@ -123,81 +87,7 @@ that is equivalent to the provided example in Scala is given below:
Refer to the [`ALS` Java docs](api/java/org/apache/spark/mllib/recommendation/ALS.html) for details on the API.
-{% highlight java %}
-import scala.Tuple2;
-
-import org.apache.spark.api.java.*;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.recommendation.ALS;
-import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
-import org.apache.spark.mllib.recommendation.Rating;
-import org.apache.spark.SparkConf;
-
-public class CollaborativeFiltering {
- public static void main(String[] args) {
- SparkConf conf = new SparkConf().setAppName("Collaborative Filtering Example");
- JavaSparkContext sc = new JavaSparkContext(conf);
-
- // Load and parse the data
- String path = "data/mllib/als/test.data";
- JavaRDD<String> data = sc.textFile(path);
- JavaRDD<Rating> ratings = data.map(
- new Function<String, Rating>() {
- public Rating call(String s) {
- String[] sarray = s.split(",");
- return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]),
- Double.parseDouble(sarray[2]));
- }
- }
- );
-
- // Build the recommendation model using ALS
- int rank = 10;
- int numIterations = 10;
- MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);
-
- // Evaluate the model on rating data
- JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
- new Function<Rating, Tuple2<Object, Object>>() {
- public Tuple2<Object, Object> call(Rating r) {
- return new Tuple2<Object, Object>(r.user(), r.product());
- }
- }
- );
- JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions = JavaPairRDD.fromJavaRDD(
- model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(
- new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
- public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
- return new Tuple2<Tuple2<Integer, Integer>, Double>(
- new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
- }
- }
- ));
- JavaRDD<Tuple2<Double, Double>> ratesAndPreds =
- JavaPairRDD.fromJavaRDD(ratings.map(
- new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
- public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r){
- return new Tuple2<Tuple2<Integer, Integer>, Double>(
- new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
- }
- }
- )).join(predictions).values();
- double MSE = JavaDoubleRDD.fromRDD(ratesAndPreds.map(
- new Function<Tuple2<Double, Double>, Object>() {
- public Object call(Tuple2<Double, Double> pair) {
- Double err = pair._1() - pair._2();
- return err * err;
- }
- }
- ).rdd()).mean();
- System.out.println("Mean Squared Error = " + MSE);
-
- // Save and load model
- model.save(sc.sc(), "myModelPath");
- MatrixFactorizationModel sameModel = MatrixFactorizationModel.load(sc.sc(), "myModelPath");
- }
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaRecommendationExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -207,29 +97,7 @@ recommendation by measuring the Mean Squared Error of rating prediction.
Refer to the [`ALS` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.recommendation.ALS) for more details on the API.
-{% highlight python %}
-from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
-
-# Load and parse the data
-data = sc.textFile("data/mllib/als/test.data")
-ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
-
-# Build the recommendation model using Alternating Least Squares
-rank = 10
-numIterations = 10
-model = ALS.train(ratings, rank, numIterations)
-
-# Evaluate the model on training data
-testdata = ratings.map(lambda p: (p[0], p[1]))
-predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
-ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
-MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
-print("Mean Squared Error = " + str(MSE))
-
-# Save and load model
-model.save(sc, "myModelPath")
-sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example python/mllib/recommendation_example.py %}
If the rating matrix is derived from other source of information (i.e., it is inferred from other
signals), you can use the trainImplicit method to get better results.