[SPARK-12247][ML][DOC] Documentation for spark.ml's ALS and collaborative filtering in general

This documents the implementation of ALS in `spark.ml` with example code in Scala, Java and Python. Author: BenFradet <benjamin.fradet@gmail.com> Closes #10411 from BenFradet/SPARK-12247.
author: BenFradet <benjamin.fradet@gmail.com> 2016-02-16 13:03:28 +0000
committer: Sean Owen <sowen@cloudera.com> 2016-02-16 13:03:28 +0000
commit: 00c72d27bf2e3591c4068fb344fa3edf1662ad81 (patch)
tree: b32ed039fd5f4e3775622a9918173df53b943e30 /examples/src/main/python/ml
parent: 827ed1c06785692d14857bd41f1fd94a0853874a (diff)
download: spark-00c72d27bf2e3591c4068fb344fa3edf1662ad81.tar.gz
spark-00c72d27bf2e3591c4068fb344fa3edf1662ad81.tar.bz2
spark-00c72d27bf2e3591c4068fb344fa3edf1662ad81.zip
1 files changed, 57 insertions, 0 deletions
diff --git a/examples/src/main/python/ml/als_example.py b/examples/src/main/python/ml/als_example.py
new file mode 100644
index 0000000000..f61c8ab5d6
--- /dev/null
+++ b/examples/src/main/python/ml/als_example.py
@@ -0,0 +1,57 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+
+# $example on$
+import math
+
+from pyspark.ml.evaluation import RegressionEvaluator
+from pyspark.ml.recommendation import ALS
+from pyspark.sql import Row
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="ALSExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    lines = sc.textFile("data/mllib/als/sample_movielens_ratings.txt")
+    parts = lines.map(lambda l: l.split("::"))
+    # Use int() for the timestamp: long() is a NameError on Python 3, int() works on 2 and 3
+    ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
+                                         rating=float(p[2]), timestamp=int(p[3])))
+    ratings = sqlContext.createDataFrame(ratingsRDD)
+    (training, test) = ratings.randomSplit([0.8, 0.2])
+
+    # Build the recommendation model using ALS on the training data
+    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")
+    model = als.fit(training)
+
+    # Evaluate the model by computing the RMSE on the test data
+    rawPredictions = model.transform(test)
+    predictions = rawPredictions\
+        .withColumn("rating", rawPredictions.rating.cast("double"))\
+        .withColumn("prediction", rawPredictions.prediction.cast("double"))
+    evaluator =\
+        RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
+    print("Root-mean-square error = " + str(evaluator.evaluate(predictions)))
+    # $example off$
+    sc.stop()
author	BenFradet <benjamin.fradet@gmail.com>	2016-02-16 13:03:28 +0000
committer	Sean Owen <sowen@cloudera.com>	2016-02-16 13:03:28 +0000
commit	00c72d27bf2e3591c4068fb344fa3edf1662ad81 (patch)
tree	b32ed039fd5f4e3775622a9918173df53b943e30 /examples/src/main/python/ml
parent	827ed1c06785692d14857bd41f1fd94a0853874a (diff)
download	spark-00c72d27bf2e3591c4068fb344fa3edf1662ad81.tar.gz spark-00c72d27bf2e3591c4068fb344fa3edf1662ad81.tar.bz2 spark-00c72d27bf2e3591c4068fb344fa3edf1662ad81.zip