diff options
Diffstat (limited to 'examples/src/main/python/ml')
6 files changed, 12 insertions, 17 deletions
diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py index 0af92050e3..8cda56dbb9 100644 --- a/examples/src/main/python/ml/decision_tree_classification_example.py +++ b/examples/src/main/python/ml/decision_tree_classification_example.py @@ -28,7 +28,6 @@ from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator -from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": @@ -36,8 +35,8 @@ if __name__ == "__main__": sqlContext = SQLContext(sc) # $example on$ - # Load and parse the data file, converting it to a DataFrame. - data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + # Load the data stored in LIBSVM format as a DataFrame. + data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. diff --git a/examples/src/main/python/ml/decision_tree_regression_example.py b/examples/src/main/python/ml/decision_tree_regression_example.py index 3857aed538..439e398947 100644 --- a/examples/src/main/python/ml/decision_tree_regression_example.py +++ b/examples/src/main/python/ml/decision_tree_regression_example.py @@ -28,7 +28,6 @@ from pyspark.ml import Pipeline from pyspark.ml.regression import DecisionTreeRegressor from pyspark.ml.feature import VectorIndexer from pyspark.ml.evaluation import RegressionEvaluator -from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": @@ -36,8 +35,8 @@ if __name__ == "__main__": sqlContext = SQLContext(sc) # $example on$ - # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + # Load the data stored in LIBSVM format as a DataFrame. + data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Automatically identify categorical features, and index them. # We specify maxCategories so features with > 4 distinct values are treated as continuous. diff --git a/examples/src/main/python/ml/gradient_boosted_trees.py b/examples/src/main/python/ml/gradient_boosted_trees.py index 6446f0fe5e..c3bf8aa2eb 100644 --- a/examples/src/main/python/ml/gradient_boosted_trees.py +++ b/examples/src/main/python/ml/gradient_boosted_trees.py @@ -24,7 +24,6 @@ from pyspark.ml.classification import GBTClassifier from pyspark.ml.feature import StringIndexer from pyspark.ml.regression import GBTRegressor from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics -from pyspark.mllib.util import MLUtils from pyspark.sql import Row, SQLContext """ @@ -70,8 +69,8 @@ if __name__ == "__main__": sc = SparkContext(appName="PythonGBTExample") sqlContext = SQLContext(sc) - # Load and parse the data file into a dataframe. - df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Map labels into an indexed column of labels in [0, numLabels) stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel") diff --git a/examples/src/main/python/ml/logistic_regression.py b/examples/src/main/python/ml/logistic_regression.py index 55afe1b207..4cd027fdfb 100644 --- a/examples/src/main/python/ml/logistic_regression.py +++ b/examples/src/main/python/ml/logistic_regression.py @@ -23,7 +23,6 @@ from pyspark import SparkContext from pyspark.ml.classification import LogisticRegression from pyspark.mllib.evaluation import MulticlassMetrics from pyspark.ml.feature import StringIndexer -from pyspark.mllib.util import MLUtils from pyspark.sql import SQLContext """ @@ -41,8 +40,8 @@ if __name__ == "__main__": sc = SparkContext(appName="PythonLogisticRegressionExample") sqlContext = SQLContext(sc) - # Load and parse the data file into a dataframe. - df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Map labels into an indexed column of labels in [0, numLabels) stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel") diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py index d8ef9f39e3..f84588f547 100644 --- a/examples/src/main/python/ml/multilayer_perceptron_classification.py +++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py @@ -22,7 +22,6 @@ from pyspark.sql import SQLContext # $example on$ from pyspark.ml.classification import MultilayerPerceptronClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator -from pyspark.mllib.util import MLUtils # $example off$ if __name__ == "__main__": @@ -32,8 +31,8 @@ if __name__ == "__main__": # $example on$ # Load training data - data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")\ - .toDF() + data = sqlContext.read.format("libsvm")\ + .load("data/mllib/sample_multiclass_classification_data.txt") # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) train = splits[0] diff --git a/examples/src/main/python/ml/random_forest_example.py b/examples/src/main/python/ml/random_forest_example.py index c7730e1bfa..dc6a778670 100644 --- a/examples/src/main/python/ml/random_forest_example.py +++ b/examples/src/main/python/ml/random_forest_example.py @@ -74,8 +74,8 @@ if __name__ == "__main__": sc = SparkContext(appName="PythonRandomForestExample") sqlContext = SQLContext(sc) - # Load and parse the data file into a dataframe. - df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() + # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Map labels into an indexed column of labels in [0, numLabels) stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")