Add Naive Bayes to Python MLlib, and some API fixes

- Added a Python wrapper for Naive Bayes
- Updated the Scala Naive Bayes to match the style of our other algorithms better and in particular make it easier to call from Java (added builder pattern, removed default value in train method)
- Updated Python MLlib functions to not require a SparkContext; we can get that from the RDD the user gives
- Added a toString method in LabeledPoint
- Made the Python MLlib tests run as part of run-tests as well (before they could only be run individually through each file)
author: Matei Zaharia <matei@databricks.com> 2014-01-09 23:55:06 -0800
committer: Matei Zaharia <matei@databricks.com> 2014-01-11 22:30:48 -0800
commit: 9a0dfdf868187fb9a2e1656e4cf5f29d952ce5db (patch)
tree: 82e54a0b5c7f502893c2f6bdd96aba6f04147707 /docs/mllib-guide.md
parent: 288a878999848adb130041d1e40c14bfc879cec6 (diff)
download: spark-9a0dfdf868187fb9a2e1656e4cf5f29d952ce5db.tar.gz
spark-9a0dfdf868187fb9a2e1656e4cf5f29d952ce5db.tar.bz2
spark-9a0dfdf868187fb9a2e1656e4cf5f29d952ce5db.zip
1 files changed, 5 insertions, 5 deletions
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
index 45ee166688..c977bc4f35 100644
--- a/docs/mllib-guide.md
+++ b/docs/mllib-guide.md
@@ -330,7 +330,7 @@ from numpy import array
 # Load and parse the data
 data = sc.textFile("mllib/data/sample_svm_data.txt")
 parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
-model = LogisticRegressionWithSGD.train(sc, parsedData)
+model = LogisticRegressionWithSGD.train(parsedData)
 
 # Build the model
 labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)),
@@ -356,7 +356,7 @@ data = sc.textFile("mllib/data/ridge-data/lpsa.data")
 parsedData = data.map(lambda line: array([float(x) for x in line.replace(',', ' ').split(' ')]))
 
 # Build the model
-model = LinearRegressionWithSGD.train(sc, parsedData)
+model = LinearRegressionWithSGD.train(parsedData)
 
 # Evaluate the model on training data
 valuesAndPreds = parsedData.map(lambda point: (point.item(0),
@@ -382,7 +382,7 @@ data = sc.textFile("kmeans_data.txt")
 parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
 
 # Build the model (cluster the data)
-clusters = KMeans.train(sc, parsedData, 2, maxIterations=10,
+clusters = KMeans.train(parsedData, 2, maxIterations=10,
         runs=30, initialization_mode="random")
 
 # Evaluate clustering by computing Within Set Sum of Squared Errors
@@ -411,7 +411,7 @@ data = sc.textFile("mllib/data/als/test.data")
 ratings = data.map(lambda line: array([float(x) for x in line.split(',')]))
 
 # Build the recommendation model using Alternating Least Squares
-model = ALS.train(sc, ratings, 1, 20)
+model = ALS.train(ratings, 1, 20)
 
 # Evaluate the model on training data
 testdata = ratings.map(lambda p: (int(p[0]), int(p[1])))
@@ -426,5 +426,5 @@ signals), you can use the trainImplicit method to get better results.
 
 {% highlight python %}
 # Build the recommendation model using Alternating Least Squares based on implicit ratings
-model = ALS.trainImplicit(sc, ratings, 1, 20)
+model = ALS.trainImplicit(ratings, 1, 20)
 {% endhighlight %}
author	Matei Zaharia <matei@databricks.com>	2014-01-09 23:55:06 -0800
committer	Matei Zaharia <matei@databricks.com>	2014-01-11 22:30:48 -0800
commit	9a0dfdf868187fb9a2e1656e4cf5f29d952ce5db (patch)
tree	82e54a0b5c7f502893c2f6bdd96aba6f04147707 /docs/mllib-guide.md
parent	288a878999848adb130041d1e40c14bfc879cec6 (diff)
download	spark-9a0dfdf868187fb9a2e1656e4cf5f29d952ce5db.tar.gz spark-9a0dfdf868187fb9a2e1656e4cf5f29d952ce5db.tar.bz2 spark-9a0dfdf868187fb9a2e1656e4cf5f29d952ce5db.zip