diff options
author | Yanbo Liang <ybliang8@gmail.com> | 2015-11-13 08:43:05 -0800 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-11-13 08:43:05 -0800 |
commit | 99693fef0a30432d94556154b81872356d921c64 (patch) | |
tree | 09d76cc0ef6cae153718982a9a1ecc827ee12d5f /docs | |
parent | 61a28486ccbcdd37461419df958aea222c8b9f09 (diff) | |
download | spark-99693fef0a30432d94556154b81872356d921c64.tar.gz spark-99693fef0a30432d94556154b81872356d921c64.tar.bz2 spark-99693fef0a30432d94556154b81872356d921c64.zip |
[SPARK-11723][ML][DOC] Use LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrame
Use LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrame, include:
* Use libSVM data source for all example codes under examples/ml, and remove unused import.
* Use libSVM data source for user guides under ml-*** which were omitted by #8697.
* Fix bug: We should use ```sqlContext.read().format("libsvm").load(path)``` at Java side, but the API doc and user guides misuse it as ```sqlContext.read.format("libsvm").load(path)```.
* Code cleanup.
mengxr
Author: Yanbo Liang <ybliang8@gmail.com>
Closes #9690 from yanboliang/spark-11723.
Diffstat (limited to 'docs')
-rw-r--r-- | docs/ml-ensembles.md | 10 | ||||
-rw-r--r-- | docs/ml-features.md | 8 | ||||
-rw-r--r-- | docs/ml-guide.md | 10 | ||||
-rw-r--r-- | docs/ml-linear-methods.md | 4 |
4 files changed, 13 insertions, 19 deletions
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md index 58f566c9b4..ce15f5e646 100644 --- a/docs/ml-ensembles.md +++ b/docs/ml-ensembles.md @@ -195,7 +195,7 @@ import org.apache.spark.ml.feature.*; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -DataFrame data = sqlContext.read.format("libsvm") +DataFrame data = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. @@ -384,7 +384,7 @@ import org.apache.spark.ml.regression.RandomForestRegressor; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -DataFrame data = sqlContext.read.format("libsvm") +DataFrame data = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. @@ -640,7 +640,7 @@ import org.apache.spark.ml.feature.*; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); +DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. @@ -830,7 +830,7 @@ import org.apache.spark.ml.regression.GBTRegressor; import org.apache.spark.sql.DataFrame; // Load and parse the data file, converting it to a DataFrame. -DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"); +DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. 
@@ -1000,7 +1000,7 @@ SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext jsql = new SQLContext(jsc); -DataFrame dataFrame = sqlContext.read.format("libsvm") +DataFrame dataFrame = sqlContext.read().format("libsvm") .load("data/mllib/sample_multiclass_classification_data.txt"); DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345); diff --git a/docs/ml-features.md b/docs/ml-features.md index 142afac2f3..cd1838d6d2 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1109,7 +1109,7 @@ import org.apache.spark.ml.feature.VectorIndexer; import org.apache.spark.ml.feature.VectorIndexerModel; import org.apache.spark.sql.DataFrame; -DataFrame data = sqlContext.read.format("libsvm") +DataFrame data = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); VectorIndexer indexer = new VectorIndexer() .setInputCol("features") @@ -1187,7 +1187,7 @@ for more details on the API. import org.apache.spark.ml.feature.Normalizer; import org.apache.spark.sql.DataFrame; -DataFrame dataFrame = sqlContext.read.format("libsvm") +DataFrame dataFrame = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); // Normalize each Vector using $L^1$ norm. 
@@ -1273,7 +1273,7 @@ import org.apache.spark.ml.feature.StandardScaler; import org.apache.spark.ml.feature.StandardScalerModel; import org.apache.spark.sql.DataFrame; -DataFrame dataFrame = sqlContext.read.format("libsvm") +DataFrame dataFrame = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); StandardScaler scaler = new StandardScaler() .setInputCol("features") @@ -1366,7 +1366,7 @@ import org.apache.spark.ml.feature.MinMaxScaler; import org.apache.spark.ml.feature.MinMaxScalerModel; import org.apache.spark.sql.DataFrame; -DataFrame dataFrame = sqlContext.read.format("libsvm") +DataFrame dataFrame = sqlContext.read().format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); MinMaxScaler scaler = new MinMaxScaler() .setInputCol("features") diff --git a/docs/ml-guide.md b/docs/ml-guide.md index c293e71d28..be18a05361 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -867,10 +867,9 @@ The `ParamMap` which produces the best evaluation metric is selected as the best import org.apache.spark.ml.evaluation.RegressionEvaluator import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} -import org.apache.spark.mllib.util.MLUtils // Prepare training and test data. 
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF() +val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345) val lr = new LinearRegression() @@ -911,14 +910,9 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator; import org.apache.spark.ml.param.ParamMap; import org.apache.spark.ml.regression.LinearRegression; import org.apache.spark.ml.tuning.*; -import org.apache.spark.mllib.regression.LabeledPoint; -import org.apache.spark.mllib.util.MLUtils; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.DataFrame; -DataFrame data = sqlContext.createDataFrame( - MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"), - LabeledPoint.class); +DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Prepare training and test data. DataFrame[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345); diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md index 16e2ee7129..85edfd3734 100644 --- a/docs/ml-linear-methods.md +++ b/docs/ml-linear-methods.md @@ -95,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample { String path = "data/mllib/sample_libsvm_data.txt"; // Load training data - DataFrame training = sqlContext.read.format("libsvm").load(path); + DataFrame training = sqlContext.read().format("libsvm").load(path); LogisticRegression lr = new LogisticRegression() .setMaxIter(10) @@ -292,7 +292,7 @@ public class LinearRegressionWithElasticNetExample { String path = "data/mllib/sample_libsvm_data.txt"; // Load training data - DataFrame training = sqlContext.read.format("libsvm").load(path); + DataFrame training = sqlContext.read().format("libsvm").load(path); LinearRegression lr = new LinearRegression() .setMaxIter(10) |