path: root/docs/ml-features.md
author     somideshmukh <somilde@us.ibm.com>    2015-12-07 23:26:34 -0800
committer  Xiangrui Meng <meng@databricks.com>  2015-12-07 23:26:34 -0800
commit     78209b0ccaf3f22b5e2345dfb2b98edfdb746819 (patch)
tree       557eeb37579186d77c60ff662342eefdb0d1a7aa /docs/ml-features.md
parent     3e7e05f5ee763925ed60410d7de04cf36b723de1 (diff)
[SPARK-11551][DOC][EXAMPLE] Replace example code in ml-features.md using include_example
Made a new patch containing only the markdown examples moved to the examples/ folder. Only three Java examples were not shifted, since they contained compilation errors; these classes are 1) StandardScaler 2) NormalizerExample 3) VectorIndexer.

Author: Xusen Yin <yinxusen@gmail.com>
Author: somideshmukh <somilde@us.ibm.com>

Closes #10002 from somideshmukh/SomilBranch1.33.
Diffstat (limited to 'docs/ml-features.md')
-rw-r--r--  docs/ml-features.md  1109
1 file changed, 51 insertions(+), 1058 deletions(-)
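
For context: the `include_example` Liquid tag used in the `+` lines below renders a snippet taken from a file under `examples/src/main/` in place of the old inline listing. The moved files are expected to wrap the doc-visible code in `$example on$` / `$example off$` marker comments so the docs build can extract just that region. The Scala sketch below is a hedged illustration of that shape; the object/main boilerplate and file contents are assumptions for illustration, not the literal TokenizerExample.scala shipped in this patch:

    // Assumed shape of examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
    package org.apache.spark.examples.ml

    // $example on$
    import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
    // $example off$
    import org.apache.spark.sql.SQLContext
    import org.apache.spark.{SparkConf, SparkContext}

    object TokenizerExample {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("TokenizerExample")
        val sc = new SparkContext(conf)
        val sqlContext = new SQLContext(sc)

        // $example on$
        // Only the code between the markers is rendered into ml-features.md by include_example.
        val sentenceDataFrame = sqlContext.createDataFrame(Seq(
          (0, "Hi I heard about Spark"),
          (1, "I wish Java could use case classes"),
          (2, "Logistic,regression,models,are,neat")
        )).toDF("label", "sentence")

        val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
        val regexTokenizer = new RegexTokenizer()
          .setInputCol("sentence")
          .setOutputCol("words")
          .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

        val tokenized = tokenizer.transform(sentenceDataFrame)
        tokenized.select("words", "label").take(3).foreach(println)

        val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
        regexTokenized.select("words", "label").take(3).foreach(println)
        // $example off$

        sc.stop()
      }
    }

A side benefit of the move is that each snippet becomes a runnable program, e.g. `bin/run-example ml.TokenizerExample` for the Scala/Java versions or `bin/spark-submit examples/src/main/python/ml/tokenizer_example.py` for the Python one.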
diff --git a/docs/ml-features.md b/docs/ml-features.md
index b499d6ec3b..5105a948fe 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -170,25 +170,7 @@ Refer to the [Tokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.fea
and the [RegexTokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
-
-val sentenceDataFrame = sqlContext.createDataFrame(Seq(
- (0, "Hi I heard about Spark"),
- (1, "I wish Java could use case classes"),
- (2, "Logistic,regression,models,are,neat")
-)).toDF("label", "sentence")
-val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val regexTokenizer = new RegexTokenizer()
- .setInputCol("sentence")
- .setOutputCol("words")
- .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
-
-val tokenized = tokenizer.transform(sentenceDataFrame)
-tokenized.select("words", "label").take(3).foreach(println)
-val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
-regexTokenized.select("words", "label").take(3).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/TokenizerExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -197,44 +179,7 @@ Refer to the [Tokenizer Java docs](api/java/org/apache/spark/ml/feature/Tokenize
and the [RegexTokenizer Java docs](api/java/org/apache/spark/ml/feature/RegexTokenizer.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.RegexTokenizer;
-import org.apache.spark.ml.feature.Tokenizer;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(0, "Hi I heard about Spark"),
- RowFactory.create(1, "I wish Java could use case classes"),
- RowFactory.create(2, "Logistic,regression,models,are,neat")
-));
-StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
- new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
-});
-DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
-Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
-DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
-for (Row r : wordsDataFrame.select("words", "label").take(3)) {
- java.util.List<String> words = r.getList(0);
- for (String word : words) System.out.print(word + " ");
- System.out.println();
-}
-
-RegexTokenizer regexTokenizer = new RegexTokenizer()
- .setInputCol("sentence")
- .setOutputCol("words")
- .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaTokenizerExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -243,21 +188,7 @@ Refer to the [Tokenizer Python docs](api/python/pyspark.ml.html#pyspark.ml.featu
and the [RegexTokenizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.RegexTokenizer)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import Tokenizer, RegexTokenizer
-
-sentenceDataFrame = sqlContext.createDataFrame([
- (0, "Hi I heard about Spark"),
- (1, "I wish Java could use case classes"),
- (2, "Logistic,regression,models,are,neat")
-], ["label", "sentence"])
-tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
-wordsDataFrame = tokenizer.transform(sentenceDataFrame)
-for words_label in wordsDataFrame.select("words", "label").take(3):
- print(words_label)
-regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
-# alternatively, pattern="\\w+", gaps(False)
-{% endhighlight %}
+{% include_example python/ml/tokenizer_example.py %}
</div>
</div>
@@ -306,19 +237,7 @@ filtered out.
Refer to the [StopWordsRemover Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.StopWordsRemover
-
-val remover = new StopWordsRemover()
- .setInputCol("raw")
- .setOutputCol("filtered")
-val dataSet = sqlContext.createDataFrame(Seq(
- (0, Seq("I", "saw", "the", "red", "baloon")),
- (1, Seq("Mary", "had", "a", "little", "lamb"))
-)).toDF("id", "raw")
-
-remover.transform(dataSet).show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -326,34 +245,7 @@ remover.transform(dataSet).show()
Refer to the [StopWordsRemover Java docs](api/java/org/apache/spark/ml/feature/StopWordsRemover.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.StopWordsRemover;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-StopWordsRemover remover = new StopWordsRemover()
- .setInputCol("raw")
- .setOutputCol("filtered");
-
-JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
- RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
-));
-StructType schema = new StructType(new StructField[] {
- new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
-});
-DataFrame dataset = jsql.createDataFrame(rdd, schema);
-
-remover.transform(dataset).show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -361,17 +253,7 @@ remover.transform(dataset).show();
Refer to the [StopWordsRemover Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import StopWordsRemover
-
-sentenceData = sqlContext.createDataFrame([
- (0, ["I", "saw", "the", "red", "baloon"]),
- (1, ["Mary", "had", "a", "little", "lamb"])
-], ["label", "raw"])
-
-remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
-remover.transform(sentenceData).show(truncate=False)
-{% endhighlight %}
+{% include_example python/ml/stopwords_remover_example.py %}
</div>
</div>
@@ -388,19 +270,7 @@ An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (t
Refer to the [NGram Scala docs](api/scala/index.html#org.apache.spark.ml.feature.NGram)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.NGram
-
-val wordDataFrame = sqlContext.createDataFrame(Seq(
- (0, Array("Hi", "I", "heard", "about", "Spark")),
- (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
- (2, Array("Logistic", "regression", "models", "are", "neat"))
-)).toDF("label", "words")
-
-val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
-val ngramDataFrame = ngram.transform(wordDataFrame)
-ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/NGramExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -408,38 +278,7 @@ ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(pri
Refer to the [NGram Java docs](api/java/org/apache/spark/ml/feature/NGram.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.NGram;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
- RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
- RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
-));
-StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
- new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
-});
-DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
-NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
-DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame);
-for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
- java.util.List<String> ngrams = r.getList(0);
- for (String ngram : ngrams) System.out.print(ngram + " --- ");
- System.out.println();
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaNGramExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -447,19 +286,7 @@ for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
Refer to the [NGram Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.NGram)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import NGram
-
-wordDataFrame = sqlContext.createDataFrame([
- (0, ["Hi", "I", "heard", "about", "Spark"]),
- (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
- (2, ["Logistic", "regression", "models", "are", "neat"])
-], ["label", "words"])
-ngram = NGram(inputCol="words", outputCol="ngrams")
-ngramDataFrame = ngram.transform(wordDataFrame)
-for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
- print(ngrams_label)
-{% endhighlight %}
+{% include_example python/ml/n_gram_example.py %}
</div>
</div>
@@ -476,26 +303,7 @@ Binarization is the process of thresholding numerical features to binary (0/1) f
Refer to the [Binarizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Binarizer)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.Binarizer
-import org.apache.spark.sql.DataFrame
-
-val data = Array(
- (0, 0.1),
- (1, 0.8),
- (2, 0.2)
-)
-val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")
-
-val binarizer: Binarizer = new Binarizer()
- .setInputCol("feature")
- .setOutputCol("binarized_feature")
- .setThreshold(0.5)
-
-val binarizedDataFrame = binarizer.transform(dataFrame)
-val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
-binarizedFeatures.collect().foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/BinarizerExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -503,40 +311,7 @@ binarizedFeatures.collect().foreach(println)
Refer to the [Binarizer Java docs](api/java/org/apache/spark/ml/feature/Binarizer.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.Binarizer;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(0, 0.1),
- RowFactory.create(1, 0.8),
- RowFactory.create(2, 0.2)
-));
-StructType schema = new StructType(new StructField[]{
- new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
- new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
-});
-DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);
-Binarizer binarizer = new Binarizer()
- .setInputCol("feature")
- .setOutputCol("binarized_feature")
- .setThreshold(0.5);
-DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
-DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
-for (Row r : binarizedFeatures.collect()) {
- Double binarized_value = r.getDouble(0);
- System.out.println(binarized_value);
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaBinarizerExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -544,20 +319,7 @@ for (Row r : binarizedFeatures.collect()) {
Refer to the [Binarizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import Binarizer
-
-continuousDataFrame = sqlContext.createDataFrame([
- (0, 0.1),
- (1, 0.8),
- (2, 0.2)
-], ["label", "feature"])
-binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
-binarizedDataFrame = binarizer.transform(continuousDataFrame)
-binarizedFeatures = binarizedDataFrame.select("binarized_feature")
-for binarized_feature, in binarizedFeatures.collect():
- print(binarized_feature)
-{% endhighlight %}
+{% include_example python/ml/binarizer_example.py %}
</div>
</div>
@@ -571,25 +333,7 @@ for binarized_feature, in binarizedFeatures.collect():
Refer to the [PCA Scala docs](api/scala/index.html#org.apache.spark.ml.feature.PCA)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.PCA
-import org.apache.spark.mllib.linalg.Vectors
-
-val data = Array(
- Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
- Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
- Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
-)
-val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
-val pca = new PCA()
- .setInputCol("features")
- .setOutputCol("pcaFeatures")
- .setK(3)
- .fit(df)
-val pcaDF = pca.transform(df)
-val result = pcaDF.select("pcaFeatures")
-result.show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/PCAExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -597,42 +341,7 @@ result.show()
Refer to the [PCA Java docs](api/java/org/apache/spark/ml/feature/PCA.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.ml.feature.PCA
-import org.apache.spark.ml.feature.PCAModel
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaSparkContext jsc = ...
-SQLContext jsql = ...
-JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
- RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
- RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
- RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
-));
-StructType schema = new StructType(new StructField[] {
- new StructField("features", new VectorUDT(), false, Metadata.empty()),
-});
-DataFrame df = jsql.createDataFrame(data, schema);
-PCAModel pca = new PCA()
- .setInputCol("features")
- .setOutputCol("pcaFeatures")
- .setK(3)
- .fit(df);
-DataFrame result = pca.transform(df).select("pcaFeatures");
-result.show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaPCAExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -640,19 +349,7 @@ result.show();
Refer to the [PCA Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.PCA)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import PCA
-from pyspark.mllib.linalg import Vectors
-
-data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
- (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
- (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
-df = sqlContext.createDataFrame(data,["features"])
-pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
-model = pca.fit(df)
-result = model.transform(df).select("pcaFeatures")
-result.show(truncate=False)
-{% endhighlight %}
+{% include_example python/ml/pca_example.py %}
</div>
</div>
@@ -666,23 +363,7 @@ result.show(truncate=False)
Refer to the [PolynomialExpansion Scala docs](api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.PolynomialExpansion
-import org.apache.spark.mllib.linalg.Vectors
-
-val data = Array(
- Vectors.dense(-2.0, 2.3),
- Vectors.dense(0.0, 0.0),
- Vectors.dense(0.6, -1.1)
-)
-val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
-val polynomialExpansion = new PolynomialExpansion()
- .setInputCol("features")
- .setOutputCol("polyFeatures")
- .setDegree(3)
-val polyDF = polynomialExpansion.transform(df)
-polyDF.select("polyFeatures").take(3).foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -690,43 +371,7 @@ polyDF.select("polyFeatures").take(3).foreach(println)
Refer to the [PolynomialExpansion Java docs](api/java/org/apache/spark/ml/feature/PolynomialExpansion.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaSparkContext jsc = ...
-SQLContext jsql = ...
-PolynomialExpansion polyExpansion = new PolynomialExpansion()
- .setInputCol("features")
- .setOutputCol("polyFeatures")
- .setDegree(3);
-JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
- RowFactory.create(Vectors.dense(-2.0, 2.3)),
- RowFactory.create(Vectors.dense(0.0, 0.0)),
- RowFactory.create(Vectors.dense(0.6, -1.1))
-));
-StructType schema = new StructType(new StructField[] {
- new StructField("features", new VectorUDT(), false, Metadata.empty()),
-});
-DataFrame df = jsql.createDataFrame(data, schema);
-DataFrame polyDF = polyExpansion.transform(df);
-Row[] row = polyDF.select("polyFeatures").take(3);
-for (Row r : row) {
- System.out.println(r.get(0));
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -734,20 +379,7 @@ for (Row r : row) {
Refer to the [PolynomialExpansion Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import PolynomialExpansion
-from pyspark.mllib.linalg import Vectors
-
-df = sqlContext.createDataFrame(
- [(Vectors.dense([-2.0, 2.3]), ),
- (Vectors.dense([0.0, 0.0]), ),
- (Vectors.dense([0.6, -1.1]), )],
- ["features"])
-px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
-polyDF = px.transform(df)
-for expanded in polyDF.select("polyFeatures").take(3):
- print(expanded)
-{% endhighlight %}
+{% include_example python/ml/polynomial_expansion_example.py %}
</div>
</div>
@@ -771,22 +403,7 @@ $0$th DCT coefficient and _not_ the $N/2$th).
Refer to the [DCT Scala docs](api/scala/index.html#org.apache.spark.ml.feature.DCT)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.DCT
-import org.apache.spark.mllib.linalg.Vectors
-
-val data = Seq(
- Vectors.dense(0.0, 1.0, -2.0, 3.0),
- Vectors.dense(-1.0, 2.0, 4.0, -7.0),
- Vectors.dense(14.0, -2.0, -5.0, 1.0))
-val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
-val dct = new DCT()
- .setInputCol("features")
- .setOutputCol("featuresDCT")
- .setInverse(false)
-val dctDf = dct.transform(df)
-dctDf.select("featuresDCT").show(3)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/DCTExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -794,39 +411,7 @@ dctDf.select("featuresDCT").show(3)
Refer to the [DCT Java docs](api/java/org/apache/spark/ml/feature/DCT.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.ml.feature.DCT;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
- RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
- RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
- RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
-));
-StructType schema = new StructType(new StructField[] {
- new StructField("features", new VectorUDT(), false, Metadata.empty()),
-});
-DataFrame df = jsql.createDataFrame(data, schema);
-DCT dct = new DCT()
- .setInputCol("features")
- .setOutputCol("featuresDCT")
- .setInverse(false);
-DataFrame dctDf = dct.transform(df);
-dctDf.select("featuresDCT").show(3);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaDCTExample.java %}
</div>
</div>
@@ -881,18 +466,7 @@ index `2`.
Refer to the [StringIndexer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StringIndexer)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.StringIndexer
-
-val df = sqlContext.createDataFrame(
- Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
-).toDF("id", "category")
-val indexer = new StringIndexer()
- .setInputCol("category")
- .setOutputCol("categoryIndex")
-val indexed = indexer.fit(df).transform(df)
-indexed.show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/StringIndexerExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -900,37 +474,7 @@ indexed.show()
Refer to the [StringIndexer Java docs](api/java/org/apache/spark/ml/feature/StringIndexer.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.StringIndexer;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(0, "a"),
- RowFactory.create(1, "b"),
- RowFactory.create(2, "c"),
- RowFactory.create(3, "a"),
- RowFactory.create(4, "a"),
- RowFactory.create(5, "c")
-));
-StructType schema = new StructType(new StructField[] {
- createStructField("id", DoubleType, false),
- createStructField("category", StringType, false)
-});
-DataFrame df = sqlContext.createDataFrame(jrdd, schema);
-StringIndexer indexer = new StringIndexer()
- .setInputCol("category")
- .setOutputCol("categoryIndex");
-DataFrame indexed = indexer.fit(df).transform(df);
-indexed.show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaStringIndexerExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -938,16 +482,7 @@ indexed.show();
Refer to the [StringIndexer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import StringIndexer
-
-df = sqlContext.createDataFrame(
- [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
- ["id", "category"])
-indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
-indexed = indexer.fit(df).transform(df)
-indexed.show()
-{% endhighlight %}
+{% include_example python/ml/string_indexer_example.py %}
</div>
</div>
@@ -961,29 +496,7 @@ indexed.show()
Refer to the [OneHotEncoder Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
-
-val df = sqlContext.createDataFrame(Seq(
- (0, "a"),
- (1, "b"),
- (2, "c"),
- (3, "a"),
- (4, "a"),
- (5, "c")
-)).toDF("id", "category")
-
-val indexer = new StringIndexer()
- .setInputCol("category")
- .setOutputCol("categoryIndex")
- .fit(df)
-val indexed = indexer.transform(df)
-
-val encoder = new OneHotEncoder().setInputCol("categoryIndex").
- setOutputCol("categoryVec")
-val encoded = encoder.transform(indexed)
-encoded.select("id", "categoryVec").foreach(println)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -991,45 +504,7 @@ encoded.select("id", "categoryVec").foreach(println)
Refer to the [OneHotEncoder Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoder.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.OneHotEncoder;
-import org.apache.spark.ml.feature.StringIndexer;
-import org.apache.spark.ml.feature.StringIndexerModel;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(0, "a"),
- RowFactory.create(1, "b"),
- RowFactory.create(2, "c"),
- RowFactory.create(3, "a"),
- RowFactory.create(4, "a"),
- RowFactory.create(5, "c")
-));
-StructType schema = new StructType(new StructField[]{
- new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
- new StructField("category", DataTypes.StringType, false, Metadata.empty())
-});
-DataFrame df = sqlContext.createDataFrame(jrdd, schema);
-StringIndexerModel indexer = new StringIndexer()
- .setInputCol("category")
- .setOutputCol("categoryIndex")
- .fit(df);
-DataFrame indexed = indexer.transform(df);
-
-OneHotEncoder encoder = new OneHotEncoder()
- .setInputCol("categoryIndex")
- .setOutputCol("categoryVec");
-DataFrame encoded = encoder.transform(indexed);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -1037,24 +512,7 @@ DataFrame encoded = encoder.transform(indexed);
Refer to the [OneHotEncoder Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import OneHotEncoder, StringIndexer
-
-df = sqlContext.createDataFrame([
- (0, "a"),
- (1, "b"),
- (2, "c"),
- (3, "a"),
- (4, "a"),
- (5, "c")
-], ["id", "category"])
-
-stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
-model = stringIndexer.fit(df)
-indexed = model.transform(df)
-encoder = OneHotEncoder(includeFirst=False, inputCol="categoryIndex", outputCol="categoryVec")
-encoded = encoder.transform(indexed)
-{% endhighlight %}
+{% include_example python/ml/onehot_encoder_example.py %}
</div>
</div>
@@ -1078,23 +536,7 @@ In the example below, we read in a dataset of labeled points and then use `Vecto
Refer to the [VectorIndexer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.VectorIndexer
-
-val data = sqlContext.read.format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt")
-val indexer = new VectorIndexer()
- .setInputCol("features")
- .setOutputCol("indexed")
- .setMaxCategories(10)
-val indexerModel = indexer.fit(data)
-val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
-println(s"Chose ${categoricalFeatures.size} categorical features: " +
- categoricalFeatures.mkString(", "))
-
-// Create new column "indexed" with categorical values transformed to indices
-val indexedData = indexerModel.transform(data)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/VectorIndexerExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -1102,30 +544,7 @@ val indexedData = indexerModel.transform(data)
Refer to the [VectorIndexer Java docs](api/java/org/apache/spark/ml/feature/VectorIndexer.html)
for more details on the API.
-{% highlight java %}
-import java.util.Map;
-
-import org.apache.spark.ml.feature.VectorIndexer;
-import org.apache.spark.ml.feature.VectorIndexerModel;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame data = sqlContext.read().format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt");
-VectorIndexer indexer = new VectorIndexer()
- .setInputCol("features")
- .setOutputCol("indexed")
- .setMaxCategories(10);
-VectorIndexerModel indexerModel = indexer.fit(data);
-Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
-System.out.print("Chose " + categoryMaps.size() + "categorical features:");
-for (Integer feature : categoryMaps.keySet()) {
- System.out.print(" " + feature);
-}
-System.out.println();
-
-// Create new column "indexed" with categorical values transformed to indices
-DataFrame indexedData = indexerModel.transform(data);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -1133,17 +552,7 @@ DataFrame indexedData = indexerModel.transform(data);
Refer to the [VectorIndexer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorIndexer)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import VectorIndexer
-
-data = sqlContext.read.format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt")
-indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
-indexerModel = indexer.fit(data)
-
-# Create new column "indexed" with categorical values transformed to indices
-indexedData = indexerModel.transform(data)
-{% endhighlight %}
+{% include_example python/ml/vector_indexer_example.py %}
</div>
</div>
@@ -1160,22 +569,7 @@ The following example demonstrates how to load a dataset in libsvm format and th
Refer to the [Normalizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Normalizer)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.Normalizer
-
-val dataFrame = sqlContext.read.format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt")
-
-// Normalize each Vector using $L^1$ norm.
-val normalizer = new Normalizer()
- .setInputCol("features")
- .setOutputCol("normFeatures")
- .setP(1.0)
-val l1NormData = normalizer.transform(dataFrame)
-
-// Normalize each Vector using $L^\infty$ norm.
-val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/NormalizerExample.scala %}
</div>
<div data-lang="java">
@@ -1183,24 +577,7 @@ val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.Positi
Refer to the [Normalizer Java docs](api/java/org/apache/spark/ml/feature/Normalizer.html)
for more details on the API.
-{% highlight java %}
-import org.apache.spark.ml.feature.Normalizer;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame dataFrame = sqlContext.read().format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt");
-
-// Normalize each Vector using $L^1$ norm.
-Normalizer normalizer = new Normalizer()
- .setInputCol("features")
- .setOutputCol("normFeatures")
- .setP(1.0);
-DataFrame l1NormData = normalizer.transform(dataFrame);
-
-// Normalize each Vector using $L^\infty$ norm.
-DataFrame lInfNormData =
- normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaNormalizerExample.java %}
</div>
<div data-lang="python">
@@ -1208,19 +585,7 @@ DataFrame lInfNormData =
Refer to the [Normalizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Normalizer)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import Normalizer
-
-dataFrame = sqlContext.read.format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt")
-
-# Normalize each Vector using $L^1$ norm.
-normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
-l1NormData = normalizer.transform(dataFrame)
-
-# Normalize each Vector using $L^\infty$ norm.
-lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
-{% endhighlight %}
+{% include_example python/ml/normalizer_example.py %}
</div>
</div>
@@ -1244,23 +609,7 @@ The following example demonstrates how to load a dataset in libsvm format and th
Refer to the [StandardScaler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StandardScaler)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.StandardScaler
-
-val dataFrame = sqlContext.read.format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt")
-val scaler = new StandardScaler()
- .setInputCol("features")
- .setOutputCol("scaledFeatures")
- .setWithStd(true)
- .setWithMean(false)
-
-// Compute summary statistics by fitting the StandardScaler
-val scalerModel = scaler.fit(dataFrame)
-
-// Normalize each feature to have unit standard deviation.
-val scaledData = scalerModel.transform(dataFrame)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/StandardScalerExample.scala %}
</div>
<div data-lang="java">
@@ -1268,25 +617,7 @@ val scaledData = scalerModel.transform(dataFrame)
Refer to the [StandardScaler Java docs](api/java/org/apache/spark/ml/feature/StandardScaler.html)
for more details on the API.
-{% highlight java %}
-import org.apache.spark.ml.feature.StandardScaler;
-import org.apache.spark.ml.feature.StandardScalerModel;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame dataFrame = sqlContext.read().format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt");
-StandardScaler scaler = new StandardScaler()
- .setInputCol("features")
- .setOutputCol("scaledFeatures")
- .setWithStd(true)
- .setWithMean(false);
-
-// Compute summary statistics by fitting the StandardScaler
-StandardScalerModel scalerModel = scaler.fit(dataFrame);
-
-// Normalize each feature to have unit standard deviation.
-DataFrame scaledData = scalerModel.transform(dataFrame);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaStandardScalerExample.java %}
</div>
<div data-lang="python">
@@ -1294,20 +625,7 @@ DataFrame scaledData = scalerModel.transform(dataFrame);
Refer to the [StandardScaler Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StandardScaler)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import StandardScaler
-
-dataFrame = sqlContext.read.format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt")
-scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
- withStd=True, withMean=False)
-
-# Compute summary statistics by fitting the StandardScaler
-scalerModel = scaler.fit(dataFrame)
-
-# Normalize each feature to have unit standard deviation.
-scaledData = scalerModel.transform(dataFrame)
-{% endhighlight %}
+{% include_example python/ml/standard_scaler_example.py %}
</div>
</div>
@@ -1337,21 +655,7 @@ Refer to the [MinMaxScaler Scala docs](api/scala/index.html#org.apache.spark.ml.
and the [MinMaxScalerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.MinMaxScaler
-
-val dataFrame = sqlContext.read.format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt")
-val scaler = new MinMaxScaler()
- .setInputCol("features")
- .setOutputCol("scaledFeatures")
-
-// Compute summary statistics and generate MinMaxScalerModel
-val scalerModel = scaler.fit(dataFrame)
-
-// rescale each feature to range [min, max].
-val scaledData = scalerModel.transform(dataFrame)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -1360,24 +664,7 @@ Refer to the [MinMaxScaler Java docs](api/java/org/apache/spark/ml/feature/MinMa
and the [MinMaxScalerModel Java docs](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html)
for more details on the API.
-{% highlight java %}
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.MinMaxScaler;
-import org.apache.spark.ml.feature.MinMaxScalerModel;
-import org.apache.spark.sql.DataFrame;
-
-DataFrame dataFrame = sqlContext.read().format("libsvm")
- .load("data/mllib/sample_libsvm_data.txt");
-MinMaxScaler scaler = new MinMaxScaler()
- .setInputCol("features")
- .setOutputCol("scaledFeatures");
-
-// Compute summary statistics and generate MinMaxScalerModel
-MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
-
-// rescale each feature to range [min, max].
-DataFrame scaledData = scalerModel.transform(dataFrame);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java %}
</div>
</div>
@@ -1401,23 +688,7 @@ The following example demonstrates how to bucketize a column of `Double`s into a
Refer to the [Bucketizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Bucketizer)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.Bucketizer
-import org.apache.spark.sql.DataFrame
-
-val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
-
-val data = Array(-0.5, -0.3, 0.0, 0.2)
-val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
-
-val bucketizer = new Bucketizer()
- .setInputCol("features")
- .setOutputCol("bucketedFeatures")
- .setSplits(splits)
-
-// Transform original data into its bucket index.
-val bucketedData = bucketizer.transform(dataFrame)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/BucketizerExample.scala %}
</div>
<div data-lang="java">
@@ -1425,38 +696,7 @@ val bucketedData = bucketizer.transform(dataFrame)
Refer to the [Bucketizer Java docs](api/java/org/apache/spark/ml/feature/Bucketizer.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
-
-JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
- RowFactory.create(-0.5),
- RowFactory.create(-0.3),
- RowFactory.create(0.0),
- RowFactory.create(0.2)
-));
-StructType schema = new StructType(new StructField[] {
- new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
-});
-DataFrame dataFrame = jsql.createDataFrame(data, schema);
-
-Bucketizer bucketizer = new Bucketizer()
- .setInputCol("features")
- .setOutputCol("bucketedFeatures")
- .setSplits(splits);
-
-// Transform original data into its bucket index.
-DataFrame bucketedData = bucketizer.transform(dataFrame);
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaBucketizerExample.java %}
</div>
<div data-lang="python">
@@ -1464,19 +704,7 @@ DataFrame bucketedData = bucketizer.transform(dataFrame);
Refer to the [Bucketizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Bucketizer)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import Bucketizer
-
-splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
-
-data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
-dataFrame = sqlContext.createDataFrame(data, ["features"])
-
-bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
-
-# Transform original data into its bucket index.
-bucketedData = bucketizer.transform(dataFrame)
-{% endhighlight %}
+{% include_example python/ml/bucketizer_example.py %}
</div>
</div>
@@ -1508,25 +736,7 @@ This example below demonstrates how to transform vectors using a transforming ve
Refer to the [ElementwiseProduct Scala docs](api/scala/index.html#org.apache.spark.ml.feature.ElementwiseProduct)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.ElementwiseProduct
-import org.apache.spark.mllib.linalg.Vectors
-
-// Create some vector data; also works for sparse vectors
-val dataFrame = sqlContext.createDataFrame(Seq(
- ("a", Vectors.dense(1.0, 2.0, 3.0)),
- ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")
-
-val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
-val transformer = new ElementwiseProduct()
- .setScalingVec(transformingVector)
- .setInputCol("vector")
- .setOutputCol("transformedVector")
-
-// Batch transform the vectors to create new column:
-transformer.transform(dataFrame).show()
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -1534,41 +744,7 @@ transformer.transform(dataFrame).show()
Refer to the [ElementwiseProduct Java docs](api/java/org/apache/spark/ml/feature/ElementwiseProduct.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.ElementwiseProduct;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SQLContext;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-// Create some vector data; also works for sparse vectors
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
- RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
- RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
-));
-List<StructField> fields = new ArrayList<StructField>(2);
-fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
-fields.add(DataTypes.createStructField("vector", DataTypes.StringType, false));
-StructType schema = DataTypes.createStructType(fields);
-DataFrame dataFrame = sqlContext.createDataFrame(jrdd, schema);
-Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
-ElementwiseProduct transformer = new ElementwiseProduct()
- .setScalingVec(transformingVector)
- .setInputCol("vector")
- .setOutputCol("transformedVector");
-// Batch transform the vectors to create new column:
-transformer.transform(dataFrame).show();
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -1576,19 +752,8 @@ transformer.transform(dataFrame).show();
Refer to the [ElementwiseProduct Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import ElementwiseProduct
-from pyspark.mllib.linalg import Vectors
-
-data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
-df = sqlContext.createDataFrame(data, ["vector"])
-transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
- inputCol="vector", outputCol="transformedVector")
-transformer.transform(df).show()
-
-{% endhighlight %}
+{% include_example python/ml/elementwise_product_example.py %}
</div>
-
</div>
## VectorAssembler
@@ -1632,19 +797,7 @@ output column to `features`, after transformation we should get the following Da
Refer to the [VectorAssembler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.ml.feature.VectorAssembler
-
-val dataset = sqlContext.createDataFrame(
- Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
-).toDF("id", "hour", "mobile", "userFeatures", "clicked")
-val assembler = new VectorAssembler()
- .setInputCols(Array("hour", "mobile", "userFeatures"))
- .setOutputCol("features")
-val output = assembler.transform(dataset)
-println(output.select("features", "clicked").first())
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -1652,36 +805,7 @@ println(output.select("features", "clicked").first())
Refer to the [VectorAssembler Java docs](api/java/org/apache/spark/ml/feature/VectorAssembler.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.*;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-StructType schema = createStructType(new StructField[] {
- createStructField("id", IntegerType, false),
- createStructField("hour", IntegerType, false),
- createStructField("mobile", DoubleType, false),
- createStructField("userFeatures", new VectorUDT(), false),
- createStructField("clicked", DoubleType, false)
-});
-Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
-JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
-DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
-
-VectorAssembler assembler = new VectorAssembler()
- .setInputCols(new String[] {"hour", "mobile", "userFeatures"})
- .setOutputCol("features");
-
-DataFrame output = assembler.transform(dataset);
-System.out.println(output.select("features", "clicked").first());
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -1689,19 +813,7 @@ System.out.println(output.select("features", "clicked").first());
Refer to the [VectorAssembler Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler)
for more details on the API.
-{% highlight python %}
-from pyspark.mllib.linalg import Vectors
-from pyspark.ml.feature import VectorAssembler
-
-dataset = sqlContext.createDataFrame(
- [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
- ["id", "hour", "mobile", "userFeatures", "clicked"])
-assembler = VectorAssembler(
- inputCols=["hour", "mobile", "userFeatures"],
- outputCol="features")
-output = assembler.transform(dataset)
-print(output.select("features", "clicked").first())
-{% endhighlight %}
+{% include_example python/ml/vector_assembler_example.py %}
</div>
</div>
@@ -1831,33 +943,7 @@ Suppose also that we have a potential input attributes for the `userFeatures`, i
Refer to the [VectorSlicer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
-import org.apache.spark.ml.feature.VectorSlicer
-import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
-
-val data = Array(
- Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
- Vectors.dense(-2.0, 2.3, 0.0)
-)
-
-val defaultAttr = NumericAttribute.defaultAttr
-val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
-val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])
-
-val dataRDD = sc.parallelize(data).map(Row.apply)
-val dataset = sqlContext.createDataFrame(dataRDD, StructType(attrGroup.toStructField()))
-
-val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
-
-slicer.setIndices(1).setNames("f3")
-// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
-
-val output = slicer.transform(dataset)
-println(output.select("userFeatures", "features").first())
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/VectorSlicerExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -1865,41 +951,7 @@ println(output.select("userFeatures", "features").first())
Refer to the [VectorSlicer Java docs](api/java/org/apache/spark/ml/feature/VectorSlicer.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.*;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-Attribute[] attrs = new Attribute[]{
- NumericAttribute.defaultAttr().withName("f1"),
- NumericAttribute.defaultAttr().withName("f2"),
- NumericAttribute.defaultAttr().withName("f3")
-};
-AttributeGroup group = new AttributeGroup("userFeatures", attrs);
-
-JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
- RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
- RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
-));
-
-DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));
-
-VectorSlicer vectorSlicer = new VectorSlicer()
- .setInputCol("userFeatures").setOutputCol("features");
-
-vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
-// or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"})
-
-DataFrame output = vectorSlicer.transform(dataset);
-
-System.out.println(output.select("userFeatures", "features").first());
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java %}
</div>
</div>
@@ -1936,21 +988,7 @@ id | country | hour | clicked | features | label
Refer to the [RFormula Scala docs](api/scala/index.html#org.apache.spark.ml.feature.RFormula)
for more details on the API.
-{% highlight scala %}
-import org.apache.spark.ml.feature.RFormula
-
-val dataset = sqlContext.createDataFrame(Seq(
- (7, "US", 18, 1.0),
- (8, "CA", 12, 0.0),
- (9, "NZ", 15, 0.0)
-)).toDF("id", "country", "hour", "clicked")
-val formula = new RFormula()
- .setFormula("clicked ~ country + hour")
- .setFeaturesCol("features")
- .setLabelCol("label")
-val output = formula.fit(dataset).transform(dataset)
-output.select("features", "label").show()
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/RFormulaExample.scala %}
</div>
<div data-lang="java" markdown="1">
@@ -1958,38 +996,7 @@ output.select("features", "label").show()
Refer to the [RFormula Java docs](api/java/org/apache/spark/ml/feature/RFormula.html)
for more details on the API.
-{% highlight java %}
-import java.util.Arrays;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.RFormula;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.*;
-import static org.apache.spark.sql.types.DataTypes.*;
-
-StructType schema = createStructType(new StructField[] {
- createStructField("id", IntegerType, false),
- createStructField("country", StringType, false),
- createStructField("hour", IntegerType, false),
- createStructField("clicked", DoubleType, false)
-});
-JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
- RowFactory.create(7, "US", 18, 1.0),
- RowFactory.create(8, "CA", 12, 0.0),
- RowFactory.create(9, "NZ", 15, 0.0)
-));
-DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
-
-RFormula formula = new RFormula()
- .setFormula("clicked ~ country + hour")
- .setFeaturesCol("features")
- .setLabelCol("label");
-
-DataFrame output = formula.fit(dataset).transform(dataset);
-output.select("features", "label").show();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaRFormulaExample.java %}
</div>
<div data-lang="python" markdown="1">
@@ -1997,21 +1004,7 @@ output.select("features", "label").show();
Refer to the [RFormula Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula)
for more details on the API.
-{% highlight python %}
-from pyspark.ml.feature import RFormula
-
-dataset = sqlContext.createDataFrame(
- [(7, "US", 18, 1.0),
- (8, "CA", 12, 0.0),
- (9, "NZ", 15, 0.0)],
- ["id", "country", "hour", "clicked"])
-formula = RFormula(
- formula="clicked ~ country + hour",
- featuresCol="features",
- labelCol="label")
-output = formula.fit(dataset).transform(dataset)
-output.select("features", "label").show()
-{% endhighlight %}
+{% include_example python/ml/rformula_example.py %}
</div>
</div>