aboutsummaryrefslogtreecommitdiff
path: root/docs/mllib-guide.md
diff options
context:
space:
mode:
authorReza Zadeh <rizlar@gmail.com>2014-01-13 23:52:34 -0800
committerReza Zadeh <rizlar@gmail.com>2014-01-13 23:52:34 -0800
commit845e568fada0550e632e7381748c5a9ebbe53e16 (patch)
tree3a4fa34894df649b5ef429cd794b73cf4b3e99b1 /docs/mllib-guide.md
parentf324d5355514b1c7ae85019b476046bb64b5593e (diff)
parentfdaabdc67387524ffb84354f87985f48bd31cf60 (diff)
downloadspark-845e568fada0550e632e7381748c5a9ebbe53e16.tar.gz
spark-845e568fada0550e632e7381748c5a9ebbe53e16.tar.bz2
spark-845e568fada0550e632e7381748c5a9ebbe53e16.zip
Merge remote-tracking branch 'upstream/master' into sparsesvd
Diffstat (limited to 'docs/mllib-guide.md')
-rw-r--r--docs/mllib-guide.md19
1 files changed, 14 insertions, 5 deletions
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 21d0464852..a140ecb618 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -21,6 +21,8 @@ depends on native Fortran routines. You may need to install the
if it is not already present on your nodes. MLlib will throw a linking error if it cannot
detect these libraries automatically.
+To use MLlib in Python, you will also need [NumPy](http://www.numpy.org) version 1.7 or newer.
+
# Binary Classification
Binary classification is a supervised learning problem in which we want to
@@ -316,6 +318,13 @@ other signals), you can use the trainImplicit method to get better results.
val model = ALS.trainImplicit(ratings, 1, 20, 0.01)
{% endhighlight %}
+# Using MLLib in Java
+
+All of MLlib's methods use Java-friendly types, so you can import and call them there the same
+way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
+Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by
+calling `.rdd()` on your `JavaRDD` object.
+
# Using MLLib in Python
Following examples can be tested in the PySpark shell.
@@ -330,7 +339,7 @@ from numpy import array
# Load and parse the data
data = sc.textFile("mllib/data/sample_svm_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
-model = LogisticRegressionWithSGD.train(sc, parsedData)
+model = LogisticRegressionWithSGD.train(parsedData)
# Build the model
labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)),
@@ -356,7 +365,7 @@ data = sc.textFile("mllib/data/ridge-data/lpsa.data")
parsedData = data.map(lambda line: array([float(x) for x in line.replace(',', ' ').split(' ')]))
# Build the model
-model = LinearRegressionWithSGD.train(sc, parsedData)
+model = LinearRegressionWithSGD.train(parsedData)
# Evaluate the model on training data
valuesAndPreds = parsedData.map(lambda point: (point.item(0),
@@ -382,7 +391,7 @@ data = sc.textFile("kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
# Build the model (cluster the data)
-clusters = KMeans.train(sc, parsedData, 2, maxIterations=10,
+clusters = KMeans.train(parsedData, 2, maxIterations=10,
runs=30, initialization_mode="random")
# Evaluate clustering by computing Within Set Sum of Squared Errors
@@ -411,7 +420,7 @@ data = sc.textFile("mllib/data/als/test.data")
ratings = data.map(lambda line: array([float(x) for x in line.split(',')]))
# Build the recommendation model using Alternating Least Squares
-model = ALS.train(sc, ratings, 1, 20)
+model = ALS.train(ratings, 1, 20)
# Evaluate the model on training data
testdata = ratings.map(lambda p: (int(p[0]), int(p[1])))
@@ -426,7 +435,7 @@ signals), you can use the trainImplicit method to get better results.
{% highlight python %}
# Build the recommendation model using Alternating Least Squares based on implicit ratings
-model = ALS.trainImplicit(sc, ratings, 1, 20)
+model = ALS.trainImplicit(ratings, 1, 20)
{% endhighlight %}