author    y-shimizu <y.shimizu0429@gmail.com>    2015-09-11 08:27:30 -0700
committer Xiangrui Meng <meng@databricks.com>   2015-09-11 08:27:30 -0700
commit    c268ca4ddde2f5213b2e3985dcaaac5900aea71c (patch)
tree      5c16aace08404309354574729ff703d2a87ec822 /docs/ml-features.md
parent    9bbe33f318c866c0b13088917542715062f0787f (diff)
[SPARK-10518] [DOCS] Update code examples in spark.ml user guide to use LIBSVM data source instead of MLUtils
Updated the example code in the spark.ml user guide to use the LIBSVM data source instead of MLUtils. Author: y-shimizu <y.shimizu0429@gmail.com> Closes #8697 from y-shimizu/SPARK-10518.
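For context, the change swaps the RDD-based `MLUtils.loadLibSVMFile` loader for the `libsvm` DataFrame data source throughout the guide. A minimal sketch of the two styles in Scala, assuming a Spark shell of the era where `sc` and `sqlContext` are already in scope and the `libsvm` data source is available:

{% highlight scala %}
import org.apache.spark.mllib.util.MLUtils

// Old style: load an RDD[LabeledPoint], then convert it to a DataFrame.
// (.toDF() needs the SQLContext implicits in scope.)
import sqlContext.implicits._
val oldStyle = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

// New style: read the file directly as a DataFrame with
// "label" (Double) and "features" (Vector) columns.
val newStyle = sqlContext.read.format("libsvm")
  .load("data/mllib/sample_libsvm_data.txt")
{% endhighlight %}

The data source route skips the intermediate RDD and the explicit DataFrame conversion, which is why each example in the diff below shrinks by a line or two.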
Diffstat (limited to 'docs/ml-features.md')
-rw-r--r--  docs/ml-features.md  64
1 file changed, 22 insertions(+), 42 deletions(-)
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 58b31a5a5c..a414c21b5c 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1179,9 +1179,9 @@ In the example below, we read in a dataset of labeled points and then use `Vecto
<div data-lang="scala" markdown="1">
{% highlight scala %}
import org.apache.spark.ml.feature.VectorIndexer
-import org.apache.spark.mllib.util.MLUtils
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
val indexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexed")
@@ -1200,16 +1200,12 @@ val indexedData = indexerModel.transform(data)
{% highlight java %}
import java.util.Map;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
-JavaRDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(),
- "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
VectorIndexer indexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexed")
@@ -1230,9 +1226,9 @@ DataFrame indexedData = indexerModel.transform(data);
<div data-lang="python" markdown="1">
{% highlight python %}
from pyspark.ml.feature import VectorIndexer
-from pyspark.mllib.util import MLUtils
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm") \
+    .load("data/mllib/sample_libsvm_data.txt")
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)
@@ -1253,10 +1249,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
<div data-lang="scala">
{% highlight scala %}
import org.apache.spark.ml.feature.Normalizer
-import org.apache.spark.mllib.util.MLUtils
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
// Normalize each Vector using $L^1$ norm.
val normalizer = new Normalizer()
@@ -1272,15 +1267,11 @@ val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.Positi
<div data-lang="java">
{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.Normalizer;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
-JavaRDD<LabeledPoint> data =
- MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
// Normalize each Vector using $L^1$ norm.
Normalizer normalizer = new Normalizer()
@@ -1297,11 +1288,10 @@ DataFrame lInfNormData =
<div data-lang="python">
{% highlight python %}
-from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import Normalizer
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-dataFrame = sqlContext.createDataFrame(data)
+dataFrame = sqlContext.read.format("libsvm") \
+    .load("data/mllib/sample_libsvm_data.txt")
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
@@ -1335,10 +1325,9 @@ The following example demonstrates how to load a dataset in libsvm format and th
<div data-lang="scala">
{% highlight scala %}
import org.apache.spark.ml.feature.StandardScaler
-import org.apache.spark.mllib.util.MLUtils
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
val scaler = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
@@ -1355,16 +1344,12 @@ val scaledData = scalerModel.transform(dataFrame)
<div data-lang="java">
{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.StandardScaler;
import org.apache.spark.ml.feature.StandardScalerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
-JavaRDD<LabeledPoint> data =
- MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
StandardScaler scaler = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
@@ -1381,11 +1366,10 @@ DataFrame scaledData = scalerModel.transform(dataFrame);
<div data-lang="python">
{% highlight python %}
-from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import StandardScaler
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-dataFrame = sqlContext.createDataFrame(data)
+dataFrame = sqlContext.read.format("libsvm") \
+    .load("data/mllib/sample_libsvm_data.txt")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
withStd=True, withMean=False)
@@ -1424,10 +1408,9 @@ More details can be found in the API docs for
[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
{% highlight scala %}
import org.apache.spark.ml.feature.MinMaxScaler
-import org.apache.spark.mllib.util.MLUtils
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-val dataFrame = sqlContext.createDataFrame(data)
+val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
val scaler = new MinMaxScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures")
@@ -1448,13 +1431,9 @@ More details can be found in the API docs for
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
-JavaRDD<LabeledPoint> data =
- MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
MinMaxScaler scaler = new MinMaxScaler()
.setInputCol("features")
.setOutputCol("scaledFeatures");