diff options
Diffstat (limited to 'examples/src/main/python/mllib/naive_bayes_example.py')
-rw-r--r-- | examples/src/main/python/mllib/naive_bayes_example.py | 13 |
1 file changed, 4 insertions, 9 deletions
diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py
index 35724f7d6a..749353b20e 100644
--- a/examples/src/main/python/mllib/naive_bayes_example.py
+++ b/examples/src/main/python/mllib/naive_bayes_example.py
@@ -29,15 +29,9 @@ import shutil
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
 
 
-def parseLine(line):
-    parts = line.split(',')
-    label = float(parts[0])
-    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
-    return LabeledPoint(label, features)
 # $example off$
 
 if __name__ == "__main__":
@@ -45,10 +39,11 @@ if __name__ == "__main__":
     sc = SparkContext(appName="PythonNaiveBayesExample")
 
     # $example on$
-    data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
+    # Load and parse the data file.
+    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 
     # Split data approximately into training (60%) and test (40%)
-    training, test = data.randomSplit([0.6, 0.4], seed=0)
+    training, test = data.randomSplit([0.6, 0.4])
 
     # Train a naive Bayes model.
     model = NaiveBayes.train(training, 1.0)