about | summary | refs | log | tree | commit | diff
path: root/examples
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2015-11-13 08:43:05 -0800
committerXiangrui Meng <meng@databricks.com>2015-11-13 08:43:05 -0800
commit99693fef0a30432d94556154b81872356d921c64 (patch)
tree09d76cc0ef6cae153718982a9a1ecc827ee12d5f /examples
parent61a28486ccbcdd37461419df958aea222c8b9f09 (diff)
downloadspark-99693fef0a30432d94556154b81872356d921c64.tar.gz
spark-99693fef0a30432d94556154b81872356d921c64.tar.bz2
spark-99693fef0a30432d94556154b81872356d921c64.zip
[SPARK-11723][ML][DOC] Use LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrame
Use LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrame, include: * Use libSVM data source for all example codes under examples/ml, and remove unused import. * Use libSVM data source for user guides under ml-*** which were omitted by #8697. * Fix bug: We should use ```sqlContext.read().format("libsvm").load(path)``` at Java side, but the API doc and user guides misuse as ```sqlContext.read.format("libsvm").load(path)```. * Code cleanup. mengxr Author: Yanbo Liang <ybliang8@gmail.com> Closes #9690 from yanboliang/spark-11723.
Diffstat (limited to 'examples')
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java8
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java9
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java6
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java23
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java6
-rw-r--r--examples/src/main/python/ml/decision_tree_classification_example.py5
-rw-r--r--examples/src/main/python/ml/decision_tree_regression_example.py5
-rw-r--r--examples/src/main/python/ml/gradient_boosted_trees.py5
-rw-r--r--examples/src/main/python/ml/logistic_regression.py5
-rw-r--r--examples/src/main/python/ml/multilayer_perceptron_classification.py5
-rw-r--r--examples/src/main/python/ml/random_forest_example.py4
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala6
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala42
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala7
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala2
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala2
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala4
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala8
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala17
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala2
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala4
21 files changed, 65 insertions, 110 deletions
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
index 51c1730a8a..482225e585 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
@@ -26,9 +26,6 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier;
import org.apache.spark.ml.classification.DecisionTreeClassificationModel;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
// $example off$
@@ -40,9 +37,8 @@ public class JavaDecisionTreeClassificationExample {
SQLContext sqlContext = new SQLContext(jsc);
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt");
- DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+ // Load the data stored in LIBSVM format as a DataFrame.
+ DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
index a4098a4233..c7f1868dd1 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
@@ -27,9 +27,6 @@ import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
// $example off$
@@ -40,9 +37,9 @@ public class JavaDecisionTreeRegressionExample {
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt");
- DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+ // Load the data stored in LIBSVM format as a DataFrame.
+ DataFrame data = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
index f48e1339c5..84369f6681 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
@@ -21,12 +21,9 @@ package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel;
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
// $example off$
@@ -43,8 +40,7 @@ public class JavaMultilayerPerceptronClassifierExample {
// $example on$
// Load training data
String path = "data/mllib/sample_multiclass_classification_data.txt";
- JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
- DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+ DataFrame dataFrame = jsql.read().format("libsvm").load(path);
// Split the data into train and test
DataFrame[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
DataFrame train = splits[0];
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
index e7f2f6f615..f0d92a56be 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
@@ -27,9 +27,7 @@ import org.apache.spark.ml.classification.OneVsRestModel;
import org.apache.spark.ml.util.MetadataUtils;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
+import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructField;
@@ -80,31 +78,30 @@ public class JavaOneVsRestExample {
OneVsRest ovr = new OneVsRest().setClassifier(classifier);
String input = params.input;
- RDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), input);
- RDD<LabeledPoint> train;
- RDD<LabeledPoint> test;
+ DataFrame inputData = jsql.read().format("libsvm").load(input);
+ DataFrame train;
+ DataFrame test;
// compute the train/ test split: if testInput is not provided use part of input
String testInput = params.testInput;
if (testInput != null) {
train = inputData;
// compute the number of features in the training set.
- int numFeatures = inputData.first().features().size();
- test = MLUtils.loadLibSVMFile(jsc.sc(), testInput, numFeatures);
+ int numFeatures = inputData.first().<Vector>getAs(1).size();
+ test = jsql.read().format("libsvm").option("numFeatures",
+ String.valueOf(numFeatures)).load(testInput);
} else {
double f = params.fracTest;
- RDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
+ DataFrame[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
train = tmp[0];
test = tmp[1];
}
// train the multiclass model
- DataFrame trainingDataFrame = jsql.createDataFrame(train, LabeledPoint.class);
- OneVsRestModel ovrModel = ovr.fit(trainingDataFrame.cache());
+ OneVsRestModel ovrModel = ovr.fit(train.cache());
// score the model on test data
- DataFrame testDataFrame = jsql.createDataFrame(test, LabeledPoint.class);
- DataFrame predictions = ovrModel.transform(testDataFrame.cache())
+ DataFrame predictions = ovrModel.transform(test.cache())
.select("prediction", "label");
// obtain metrics
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
index 23f834ab43..d433905fc8 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
@@ -23,8 +23,6 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.tuning.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
@@ -46,9 +44,7 @@ public class JavaTrainValidationSplitExample {
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext jsql = new SQLContext(jsc);
- DataFrame data = jsql.createDataFrame(
- MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"),
- LabeledPoint.class);
+ DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Prepare training and test data.
DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345);
diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py
index 0af92050e3..8cda56dbb9 100644
--- a/examples/src/main/python/ml/decision_tree_classification_example.py
+++ b/examples/src/main/python/ml/decision_tree_classification_example.py
@@ -28,7 +28,6 @@ from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
@@ -36,8 +35,8 @@ if __name__ == "__main__":
sqlContext = SQLContext(sc)
# $example on$
- # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
diff --git a/examples/src/main/python/ml/decision_tree_regression_example.py b/examples/src/main/python/ml/decision_tree_regression_example.py
index 3857aed538..439e398947 100644
--- a/examples/src/main/python/ml/decision_tree_regression_example.py
+++ b/examples/src/main/python/ml/decision_tree_regression_example.py
@@ -28,7 +28,6 @@ from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
@@ -36,8 +35,8 @@ if __name__ == "__main__":
sqlContext = SQLContext(sc)
# $example on$
- # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
diff --git a/examples/src/main/python/ml/gradient_boosted_trees.py b/examples/src/main/python/ml/gradient_boosted_trees.py
index 6446f0fe5e..c3bf8aa2eb 100644
--- a/examples/src/main/python/ml/gradient_boosted_trees.py
+++ b/examples/src/main/python/ml/gradient_boosted_trees.py
@@ -24,7 +24,6 @@ from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics
-from pyspark.mllib.util import MLUtils
from pyspark.sql import Row, SQLContext
"""
@@ -70,8 +69,8 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonGBTExample")
sqlContext = SQLContext(sc)
- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
diff --git a/examples/src/main/python/ml/logistic_regression.py b/examples/src/main/python/ml/logistic_regression.py
index 55afe1b207..4cd027fdfb 100644
--- a/examples/src/main/python/ml/logistic_regression.py
+++ b/examples/src/main/python/ml/logistic_regression.py
@@ -23,7 +23,6 @@ from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StringIndexer
-from pyspark.mllib.util import MLUtils
from pyspark.sql import SQLContext
"""
@@ -41,8 +40,8 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonLogisticRegressionExample")
sqlContext = SQLContext(sc)
- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index d8ef9f39e3..f84588f547 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -22,7 +22,6 @@ from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
@@ -32,8 +31,8 @@ if __name__ == "__main__":
# $example on$
# Load training data
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")\
- .toDF()
+ data = sqlContext.read.format("libsvm")\
+ .load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
diff --git a/examples/src/main/python/ml/random_forest_example.py b/examples/src/main/python/ml/random_forest_example.py
index c7730e1bfa..dc6a778670 100644
--- a/examples/src/main/python/ml/random_forest_example.py
+++ b/examples/src/main/python/ml/random_forest_example.py
@@ -74,8 +74,8 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonRandomForestExample")
sqlContext = SQLContext(sc)
- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
index a24a344f1b..ff8a0a90f1 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
@@ -26,7 +26,6 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils
// $example off$
object DecisionTreeClassificationExample {
@@ -34,10 +33,9 @@ object DecisionTreeClassificationExample {
val conf = new SparkConf().setAppName("DecisionTreeClassificationExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ // Load the data stored in LIBSVM format as a DataFrame.
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
index f28671f786..c4e98dfaca 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
@@ -32,10 +32,7 @@ import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTree
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.evaluation.{RegressionMetrics, MulticlassMetrics}
import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}
@@ -138,15 +135,18 @@ object DecisionTreeExample {
/** Load a dataset from the given path, using the given format */
private[ml] def loadData(
- sc: SparkContext,
+ sqlContext: SQLContext,
path: String,
format: String,
- expectedNumFeatures: Option[Int] = None): RDD[LabeledPoint] = {
+ expectedNumFeatures: Option[Int] = None): DataFrame = {
+ import sqlContext.implicits._
+
format match {
- case "dense" => MLUtils.loadLabeledPoints(sc, path)
+ case "dense" => MLUtils.loadLabeledPoints(sqlContext.sparkContext, path).toDF()
case "libsvm" => expectedNumFeatures match {
- case Some(numFeatures) => MLUtils.loadLibSVMFile(sc, path, numFeatures)
- case None => MLUtils.loadLibSVMFile(sc, path)
+ case Some(numFeatures) => sqlContext.read.option("numFeatures", numFeatures.toString)
+ .format("libsvm").load(path)
+ case None => sqlContext.read.format("libsvm").load(path)
}
case _ => throw new IllegalArgumentException(s"Bad data format: $format")
}
@@ -169,36 +169,22 @@ object DecisionTreeExample {
algo: String,
fracTest: Double): (DataFrame, DataFrame) = {
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// Load training data
- val origExamples: RDD[LabeledPoint] = loadData(sc, input, dataFormat)
+ val origExamples: DataFrame = loadData(sqlContext, input, dataFormat)
// Load or create test set
- val splits: Array[RDD[LabeledPoint]] = if (testInput != "") {
+ val dataframes: Array[DataFrame] = if (testInput != "") {
// Load testInput.
- val numFeatures = origExamples.take(1)(0).features.size
- val origTestExamples: RDD[LabeledPoint] =
- loadData(sc, testInput, dataFormat, Some(numFeatures))
+ val numFeatures = origExamples.first().getAs[Vector](1).size
+ val origTestExamples: DataFrame =
+ loadData(sqlContext, testInput, dataFormat, Some(numFeatures))
Array(origExamples, origTestExamples)
} else {
// Split input into training, test.
origExamples.randomSplit(Array(1.0 - fracTest, fracTest), seed = 12345)
}
- // For classification, convert labels to Strings since we will index them later with
- // StringIndexer.
- def labelsToStrings(data: DataFrame): DataFrame = {
- algo.toLowerCase match {
- case "classification" =>
- data.withColumn("labelString", data("label").cast(StringType))
- case "regression" =>
- data
- case _ =>
- throw new IllegalArgumentException("Algo ${params.algo} not supported.")
- }
- }
- val dataframes = splits.map(_.toDF()).map(labelsToStrings)
val training = dataframes(0).cache()
val test = dataframes(1).cache()
@@ -230,7 +216,7 @@ object DecisionTreeExample {
val labelColName = if (algo == "classification") "indexedLabel" else "label"
if (algo == "classification") {
val labelIndexer = new StringIndexer()
- .setInputCol("labelString")
+ .setInputCol("label")
.setOutputCol(labelColName)
stages += labelIndexer
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
index 64cd986129..fc402724d2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
@@ -25,17 +25,16 @@ import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.mllib.util.MLUtils
// $example off$
object DecisionTreeRegressionExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("DecisionTreeRegressionExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ // Load the data stored in LIBSVM format as a DataFrame.
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Automatically identify categorical features, and index them.
// Here, we treat features with > 4 distinct values as continuous.
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
index f4a15f806e..6b0be0f34e 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
@@ -153,7 +153,7 @@ object GBTExample {
val labelColName = if (algo == "classification") "indexedLabel" else "label"
if (algo == "classification") {
val labelIndexer = new StringIndexer()
- .setInputCol("labelString")
+ .setInputCol("label")
.setOutputCol(labelColName)
stages += labelIndexer
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
index b73299fb12..50998c94de 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
@@ -131,7 +131,7 @@ object LinearRegressionExample {
println(s"Training time: $elapsedTime seconds")
// Print the weights and intercept for linear regression.
- println(s"Weights: ${lirModel.weights} Intercept: ${lirModel.intercept}")
+ println(s"Weights: ${lirModel.coefficients} Intercept: ${lirModel.intercept}")
println("Training data results:")
DecisionTreeExample.evaluateRegressionModel(lirModel, training, "label")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
index 8e3760ddb5..a380c90662 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -125,7 +125,7 @@ object LogisticRegressionExample {
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
- .setInputCol("labelString")
+ .setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
@@ -149,7 +149,7 @@ object LogisticRegressionExample {
val lorModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
// Print the weights and intercept for logistic regression.
- println(s"Weights: ${lorModel.weights} Intercept: ${lorModel.intercept}")
+ println(s"Weights: ${lorModel.coefficients} Intercept: ${lorModel.intercept}")
println("Training data results:")
DecisionTreeExample.evaluateClassificationModel(pipelineModel, training, "indexedLabel")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
index 99d5f35b5a..146b83c8be 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
@@ -23,7 +23,6 @@ import org.apache.spark.sql.SQLContext
// $example on$
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils
// $example off$
/**
@@ -35,12 +34,11 @@ object MultilayerPerceptronClassifierExample {
val conf = new SparkConf().setAppName("MultilayerPerceptronClassifierExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// $example on$
- // Load training data
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")
- .toDF()
+ // Load the data stored in LIBSVM format as a DataFrame.
+ val data = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_multiclass_classification_data.txt")
// Split the data into train and test
val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L)
val train = splits(0)
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
index bab31f585b..8e4f1b09a2 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -27,9 +27,8 @@ import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.classification.{OneVsRest, LogisticRegression}
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.evaluation.MulticlassMetrics
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
/**
@@ -111,24 +110,24 @@ object OneVsRestExample {
private def run(params: Params) {
val conf = new SparkConf().setAppName(s"OneVsRestExample with $params")
val sc = new SparkContext(conf)
- val inputData = MLUtils.loadLibSVMFile(sc, params.input)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val inputData = sqlContext.read.format("libsvm").load(params.input)
// compute the train/test split: if testInput is not provided use part of input.
val data = params.testInput match {
case Some(t) => {
// compute the number of features in the training set.
- val numFeatures = inputData.first().features.size
- val testData = MLUtils.loadLibSVMFile(sc, t, numFeatures)
- Array[RDD[LabeledPoint]](inputData, testData)
+ val numFeatures = inputData.first().getAs[Vector](1).size
+ val testData = sqlContext.read.option("numFeatures", numFeatures.toString)
+ .format("libsvm").load(t)
+ Array[DataFrame](inputData, testData)
}
case None => {
val f = params.fracTest
inputData.randomSplit(Array(1 - f, f), seed = 12345)
}
}
- val Array(train, test) = data.map(_.toDF().cache())
+ val Array(train, test) = data.map(_.cache())
// instantiate the base classifier
val classifier = new LogisticRegression()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
index 109178f413..7a00d99dfe 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
@@ -159,7 +159,7 @@ object RandomForestExample {
val labelColName = if (algo == "classification") "indexedLabel" else "label"
if (algo == "classification") {
val labelIndexer = new StringIndexer()
- .setInputCol("labelString")
+ .setInputCol("label")
.setOutputCol(labelColName)
stages += labelIndexer
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
index 1abdf219b1..cd1b0e9358 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
@@ -20,7 +20,6 @@ package org.apache.spark.examples.ml
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
-import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
@@ -39,10 +38,9 @@ object TrainValidationSplitExample {
val conf = new SparkConf().setAppName("TrainValidationSplitExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// Prepare training and test data.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
val lr = new LinearRegression()