path: root/docs/ml-features.md
author Cheng Lian <lian@databricks.com> 2015-12-08 19:18:59 +0800
committer Cheng Lian <lian@databricks.com> 2015-12-08 19:18:59 +0800
commit da2012a0e152aa078bdd19a5c7f91786a2dd7016 (patch)
tree 1f00975b821733925effbaf0090a40795c50d669 /docs/ml-features.md
parent 037b7e76a7f8b59e031873a768d81417dd180472 (diff)
[SPARK-11551][DOC][EXAMPLE] Revert PR #10002
This reverts PR #10002, commit 78209b0ccaf3f22b5e2345dfb2b98edfdb746819. The original PR wasn't tested on Jenkins before being merged.

Author: Cheng Lian <lian@databricks.com>

Closes #10200 from liancheng/revert-pr-10002.
Diffstat (limited to 'docs/ml-features.md')
-rw-r--r-- docs/ml-features.md | 1109
1 file changed, 1058 insertions(+), 51 deletions(-)
diff --git a/docs/ml-features.md b/docs/ml-features.md
index f85e0d56d2..01d6abeb5b 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -170,7 +170,25 @@ Refer to the [Tokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.fea
and the [RegexTokenizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/TokenizerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
+
+val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+ (0, "Hi I heard about Spark"),
+ (1, "I wish Java could use case classes"),
+ (2, "Logistic,regression,models,are,neat")
+)).toDF("label", "sentence")
+val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
+val regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+val tokenized = tokenizer.transform(sentenceDataFrame)
+tokenized.select("words", "label").take(3).foreach(println)
+val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+regexTokenized.select("words", "label").take(3).foreach(println)
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -179,7 +197,44 @@ Refer to the [Tokenizer Java docs](api/java/org/apache/spark/ml/feature/Tokenize
and the [RegexTokenizer Java docs](api/java/org/apache/spark/ml/feature/RegexTokenizer.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaTokenizerExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RegexTokenizer;
+import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "Hi I heard about Spark"),
+ RowFactory.create(1, "I wish Java could use case classes"),
+ RowFactory.create(2, "Logistic,regression,models,are,neat")
+));
+StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
+});
+DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
+Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
+DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
+for (Row r : wordsDataFrame.select("words", "label").take(3)) {
+ java.util.List<String> words = r.getList(0);
+ for (String word : words) System.out.print(word + " ");
+ System.out.println();
+}
+
+RegexTokenizer regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+{% endhighlight %}
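+For completeness, a short continuation (a sketch, not part of the original example) that applies the `regexTokenizer` defined above to the same `sentenceDataFrame`:
+
+{% highlight java %}
+// Apply the regex-based tokenizer and print the first three tokenized rows.
+DataFrame regexTokenized = regexTokenizer.transform(sentenceDataFrame);
+for (Row r : regexTokenized.select("words", "label").take(3)) {
+  java.util.List<String> words = r.getList(0);
+  for (String word : words) System.out.print(word + " ");
+  System.out.println();
+}
+{% endhighlight %}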
</div>
<div data-lang="python" markdown="1">
@@ -188,7 +243,21 @@ Refer to the [Tokenizer Python docs](api/python/pyspark.ml.html#pyspark.ml.featu
and the [RegexTokenizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.RegexTokenizer)
for more details on the API.
-{% include_example python/ml/tokenizer_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import Tokenizer, RegexTokenizer
+
+sentenceDataFrame = sqlContext.createDataFrame([
+ (0, "Hi I heard about Spark"),
+ (1, "I wish Java could use case classes"),
+ (2, "Logistic,regression,models,are,neat")
+], ["label", "sentence"])
+tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
+wordsDataFrame = tokenizer.transform(sentenceDataFrame)
+for words_label in wordsDataFrame.select("words", "label").take(3):
+ print(words_label)
+regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
+# alternatively, pattern="\\w+", gaps=False
+{% endhighlight %}
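+A short continuation (a sketch, mirroring the Scala example) that applies the `regexTokenizer` defined above:
+
+{% highlight python %}
+# Apply the regex-based tokenizer and print the first three tokenized rows.
+regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+for words_label in regexTokenized.select("words", "label").take(3):
+    print(words_label)
+{% endhighlight %}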
</div>
</div>
@@ -237,7 +306,19 @@ filtered out.
Refer to the [StopWordsRemover Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.StopWordsRemover
+
+val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+val dataSet = sqlContext.createDataFrame(Seq(
+ (0, Seq("I", "saw", "the", "red", "baloon")),
+ (1, Seq("Mary", "had", "a", "little", "lamb"))
+)).toDF("id", "raw")
+
+remover.transform(dataSet).show()
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -245,7 +326,34 @@ for more details on the API.
Refer to the [StopWordsRemover Java docs](api/java/org/apache/spark/ml/feature/StopWordsRemover.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StopWordsRemover;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+StopWordsRemover remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered");
+
+JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+ RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
+));
+StructType schema = new StructType(new StructField[] {
+ new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+});
+DataFrame dataset = jsql.createDataFrame(rdd, schema);
+
+remover.transform(dataset).show();
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -253,7 +361,17 @@ for more details on the API.
Refer to the [StopWordsRemover Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover)
for more details on the API.
-{% include_example python/ml/stopwords_remover_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import StopWordsRemover
+
+sentenceData = sqlContext.createDataFrame([
+ (0, ["I", "saw", "the", "red", "baloon"]),
+ (1, ["Mary", "had", "a", "little", "lamb"])
+], ["label", "raw"])
+
+remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
+remover.transform(sentenceData).show(truncate=False)
+{% endhighlight %}
</div>
</div>
@@ -270,7 +388,19 @@ An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (t
Refer to the [NGram Scala docs](api/scala/index.html#org.apache.spark.ml.feature.NGram)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/NGramExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.NGram
+
+val wordDataFrame = sqlContext.createDataFrame(Seq(
+ (0, Array("Hi", "I", "heard", "about", "Spark")),
+ (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
+ (2, Array("Logistic", "regression", "models", "are", "neat"))
+)).toDF("label", "words")
+
+val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
+val ngramDataFrame = ngram.transform(wordDataFrame)
+ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -278,7 +408,38 @@ for more details on the API.
Refer to the [NGram Java docs](api/java/org/apache/spark/ml/feature/NGram.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaNGramExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.NGram;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
+ RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
+ RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
+));
+StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+});
+DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
+NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame);
+for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
+ java.util.List<String> ngrams = r.getList(0);
+ for (String ngram : ngrams) System.out.print(ngram + " --- ");
+ System.out.println();
+}
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -286,7 +447,19 @@ for more details on the API.
Refer to the [NGram Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.NGram)
for more details on the API.
-{% include_example python/ml/n_gram_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import NGram
+
+wordDataFrame = sqlContext.createDataFrame([
+ (0, ["Hi", "I", "heard", "about", "Spark"]),
+ (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
+ (2, ["Logistic", "regression", "models", "are", "neat"])
+], ["label", "words"])
+ngram = NGram(inputCol="words", outputCol="ngrams")
+ngramDataFrame = ngram.transform(wordDataFrame)
+for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
+ print(ngrams_label)
+{% endhighlight %}
</div>
</div>
@@ -303,7 +476,26 @@ Binarization is the process of thresholding numerical features to binary (0/1) f
Refer to the [Binarizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Binarizer)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/BinarizerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.Binarizer
+import org.apache.spark.sql.DataFrame
+
+val data = Array(
+ (0, 0.1),
+ (1, 0.8),
+ (2, 0.2)
+)
+val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")
+
+val binarizer: Binarizer = new Binarizer()
+ .setInputCol("feature")
+ .setOutputCol("binarized_feature")
+ .setThreshold(0.5)
+
+val binarizedDataFrame = binarizer.transform(dataFrame)
+val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+binarizedFeatures.collect().foreach(println)
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -311,7 +503,40 @@ for more details on the API.
Refer to the [Binarizer Java docs](api/java/org/apache/spark/ml/feature/Binarizer.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaBinarizerExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.Binarizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, 0.1),
+ RowFactory.create(1, 0.8),
+ RowFactory.create(2, 0.2)
+));
+StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
+});
+DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);
+Binarizer binarizer = new Binarizer()
+ .setInputCol("feature")
+ .setOutputCol("binarized_feature")
+ .setThreshold(0.5);
+DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
+DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
+for (Row r : binarizedFeatures.collect()) {
+ Double binarized_value = r.getDouble(0);
+ System.out.println(binarized_value);
+}
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -319,7 +544,20 @@ for more details on the API.
Refer to the [Binarizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
for more details on the API.
-{% include_example python/ml/binarizer_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import Binarizer
+
+continuousDataFrame = sqlContext.createDataFrame([
+ (0, 0.1),
+ (1, 0.8),
+ (2, 0.2)
+], ["label", "feature"])
+binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+binarizedDataFrame = binarizer.transform(continuousDataFrame)
+binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+for binarized_feature, in binarizedFeatures.collect():
+ print(binarized_feature)
+{% endhighlight %}
</div>
</div>
@@ -333,7 +571,25 @@ for more details on the API.
Refer to the [PCA Scala docs](api/scala/index.html#org.apache.spark.ml.feature.PCA)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/PCAExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Array(
+ Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+ Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+ Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+)
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val pca = new PCA()
+ .setInputCol("features")
+ .setOutputCol("pcaFeatures")
+ .setK(3)
+ .fit(df)
+val pcaDF = pca.transform(df)
+val result = pcaDF.select("pcaFeatures")
+result.show()
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -341,7 +597,42 @@ for more details on the API.
Refer to the [PCA Java docs](api/java/org/apache/spark/ml/feature/PCA.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaPCAExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.PCA;
+import org.apache.spark.ml.feature.PCAModel;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaSparkContext jsc = ...
+SQLContext jsql = ...
+JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
+ RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
+ RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+));
+StructType schema = new StructType(new StructField[] {
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+PCAModel pca = new PCA()
+ .setInputCol("features")
+ .setOutputCol("pcaFeatures")
+ .setK(3)
+ .fit(df);
+DataFrame result = pca.transform(df).select("pcaFeatures");
+result.show();
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -349,7 +640,19 @@ for more details on the API.
Refer to the [PCA Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.PCA)
for more details on the API.
-{% include_example python/ml/pca_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import PCA
+from pyspark.mllib.linalg import Vectors
+
+data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+ (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+ (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+df = sqlContext.createDataFrame(data, ["features"])
+pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
+model = pca.fit(df)
+result = model.transform(df).select("pcaFeatures")
+result.show(truncate=False)
+{% endhighlight %}
</div>
</div>
@@ -363,7 +666,23 @@ for more details on the API.
Refer to the [PolynomialExpansion Scala docs](api/scala/index.html#org.apache.spark.ml.feature.PolynomialExpansion)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.PolynomialExpansion
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Array(
+ Vectors.dense(-2.0, 2.3),
+ Vectors.dense(0.0, 0.0),
+ Vectors.dense(0.6, -1.1)
+)
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val polynomialExpansion = new PolynomialExpansion()
+ .setInputCol("features")
+ .setOutputCol("polyFeatures")
+ .setDegree(3)
+val polyDF = polynomialExpansion.transform(df)
+polyDF.select("polyFeatures").take(3).foreach(println)
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -371,7 +690,43 @@ for more details on the API.
Refer to the [PolynomialExpansion Java docs](api/java/org/apache/spark/ml/feature/PolynomialExpansion.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.PolynomialExpansion;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaSparkContext jsc = ...
+SQLContext jsql = ...
+PolynomialExpansion polyExpansion = new PolynomialExpansion()
+ .setInputCol("features")
+ .setOutputCol("polyFeatures")
+ .setDegree(3);
+JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.dense(-2.0, 2.3)),
+ RowFactory.create(Vectors.dense(0.0, 0.0)),
+ RowFactory.create(Vectors.dense(0.6, -1.1))
+));
+StructType schema = new StructType(new StructField[] {
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+DataFrame polyDF = polyExpansion.transform(df);
+Row[] row = polyDF.select("polyFeatures").take(3);
+for (Row r : row) {
+ System.out.println(r.get(0));
+}
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -379,7 +734,20 @@ for more details on the API.
Refer to the [PolynomialExpansion Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.PolynomialExpansion)
for more details on the API.
-{% include_example python/ml/polynomial_expansion_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import PolynomialExpansion
+from pyspark.mllib.linalg import Vectors
+
+df = sqlContext.createDataFrame(
+ [(Vectors.dense([-2.0, 2.3]), ),
+ (Vectors.dense([0.0, 0.0]), ),
+ (Vectors.dense([0.6, -1.1]), )],
+ ["features"])
+px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
+polyDF = px.transform(df)
+for expanded in polyDF.select("polyFeatures").take(3):
+ print(expanded)
+{% endhighlight %}
</div>
</div>
@@ -403,7 +771,22 @@ $0$th DCT coefficient and _not_ the $N/2$th).
Refer to the [DCT Scala docs](api/scala/index.html#org.apache.spark.ml.feature.DCT)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/DCTExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.DCT
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Seq(
+ Vectors.dense(0.0, 1.0, -2.0, 3.0),
+ Vectors.dense(-1.0, 2.0, 4.0, -7.0),
+ Vectors.dense(14.0, -2.0, -5.0, 1.0))
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val dct = new DCT()
+ .setInputCol("features")
+ .setOutputCol("featuresDCT")
+ .setInverse(false)
+val dctDf = dct.transform(df)
+dctDf.select("featuresDCT").show(3)
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -411,7 +794,39 @@ for more details on the API.
Refer to the [DCT Java docs](api/java/org/apache/spark/ml/feature/DCT.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaDCTExample.java %}}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.DCT;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
+ RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
+ RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
+));
+StructType schema = new StructType(new StructField[] {
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+DCT dct = new DCT()
+ .setInputCol("features")
+ .setOutputCol("featuresDCT")
+ .setInverse(false);
+DataFrame dctDf = dct.transform(df);
+dctDf.select("featuresDCT").show(3);
+{% endhighlight %}
</div>
</div>
@@ -466,7 +881,18 @@ index `2`.
Refer to the [StringIndexer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StringIndexer)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/StringIndexerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.StringIndexer
+
+val df = sqlContext.createDataFrame(
+ Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+).toDF("id", "category")
+val indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+val indexed = indexer.fit(df).transform(df)
+indexed.show()
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -474,7 +900,37 @@ for more details on the API.
Refer to the [StringIndexer Java docs](api/java/org/apache/spark/ml/feature/StringIndexer.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaStringIndexerExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "a"),
+ RowFactory.create(1, "b"),
+ RowFactory.create(2, "c"),
+ RowFactory.create(3, "a"),
+ RowFactory.create(4, "a"),
+ RowFactory.create(5, "c")
+));
+StructType schema = new StructType(new StructField[] {
+ createStructField("id", DoubleType, false),
+ createStructField("category", StringType, false)
+});
+DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+StringIndexer indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex");
+DataFrame indexed = indexer.fit(df).transform(df);
+indexed.show();
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -482,7 +938,16 @@ for more details on the API.
Refer to the [StringIndexer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer)
for more details on the API.
-{% include_example python/ml/string_indexer_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import StringIndexer
+
+df = sqlContext.createDataFrame(
+ [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
+ ["id", "category"])
+indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+indexed = indexer.fit(df).transform(df)
+indexed.show()
+{% endhighlight %}
</div>
</div>
@@ -496,7 +961,29 @@ for more details on the API.
Refer to the [OneHotEncoder Scala docs](api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
+
+val df = sqlContext.createDataFrame(Seq(
+ (0, "a"),
+ (1, "b"),
+ (2, "c"),
+ (3, "a"),
+ (4, "a"),
+ (5, "c")
+)).toDF("id", "category")
+
+val indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+ .fit(df)
+val indexed = indexer.transform(df)
+
+val encoder = new OneHotEncoder()
+  .setInputCol("categoryIndex")
+  .setOutputCol("categoryVec")
+val encoded = encoder.transform(indexed)
+encoded.select("id", "categoryVec").foreach(println)
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -504,7 +991,45 @@ for more details on the API.
Refer to the [OneHotEncoder Java docs](api/java/org/apache/spark/ml/feature/OneHotEncoder.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.OneHotEncoder;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.ml.feature.StringIndexerModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "a"),
+ RowFactory.create(1, "b"),
+ RowFactory.create(2, "c"),
+ RowFactory.create(3, "a"),
+ RowFactory.create(4, "a"),
+ RowFactory.create(5, "c")
+));
+StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("category", DataTypes.StringType, false, Metadata.empty())
+});
+DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+StringIndexerModel indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+ .fit(df);
+DataFrame indexed = indexer.transform(df);
+
+OneHotEncoder encoder = new OneHotEncoder()
+ .setInputCol("categoryIndex")
+ .setOutputCol("categoryVec");
+DataFrame encoded = encoder.transform(indexed);
+{% endhighlight %}
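+To inspect the encoded vectors, a usage sketch (assuming the `encoded` DataFrame from above):
+
+{% highlight java %}
+// Display the one-hot encoded column alongside the id.
+encoded.select("id", "categoryVec").show();
+{% endhighlight %}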
</div>
<div data-lang="python" markdown="1">
@@ -512,7 +1037,24 @@ for more details on the API.
Refer to the [OneHotEncoder Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder)
for more details on the API.
-{% include_example python/ml/onehot_encoder_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import OneHotEncoder, StringIndexer
+
+df = sqlContext.createDataFrame([
+ (0, "a"),
+ (1, "b"),
+ (2, "c"),
+ (3, "a"),
+ (4, "a"),
+ (5, "c")
+], ["id", "category"])
+
+stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+model = stringIndexer.fit(df)
+indexed = model.transform(df)
+encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
+encoded = encoder.transform(indexed)
+{% endhighlight %}
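+As in the other languages, the result can be inspected (a usage sketch):
+
+{% highlight python %}
+# Display the one-hot encoded column alongside the id.
+encoded.select("id", "categoryVec").show()
+{% endhighlight %}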
</div>
</div>
@@ -536,7 +1078,23 @@ In the example below, we read in a dataset of labeled points and then use `Vecto
Refer to the [VectorIndexer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorIndexer)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/VectorIndexerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.VectorIndexer
+
+val data = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+val indexer = new VectorIndexer()
+ .setInputCol("features")
+ .setOutputCol("indexed")
+ .setMaxCategories(10)
+val indexerModel = indexer.fit(data)
+val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
+println(s"Chose ${categoricalFeatures.size} categorical features: " +
+ categoricalFeatures.mkString(", "))
+
+// Create new column "indexed" with categorical values transformed to indices
+val indexedData = indexerModel.transform(data)
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -544,7 +1102,30 @@ for more details on the API.
Refer to the [VectorIndexer Java docs](api/java/org/apache/spark/ml/feature/VectorIndexer.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java %}
+{% highlight java %}
+import java.util.Map;
+
+import org.apache.spark.ml.feature.VectorIndexer;
+import org.apache.spark.ml.feature.VectorIndexerModel;
+import org.apache.spark.sql.DataFrame;
+
+DataFrame data = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
+VectorIndexer indexer = new VectorIndexer()
+ .setInputCol("features")
+ .setOutputCol("indexed")
+ .setMaxCategories(10);
+VectorIndexerModel indexerModel = indexer.fit(data);
+Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
+System.out.print("Chose " + categoryMaps.size() + "categorical features:");
+for (Integer feature : categoryMaps.keySet()) {
+ System.out.print(" " + feature);
+}
+System.out.println();
+
+// Create new column "indexed" with categorical values transformed to indices
+DataFrame indexedData = indexerModel.transform(data);
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -552,7 +1133,17 @@ for more details on the API.
Refer to the [VectorIndexer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorIndexer)
for more details on the API.
-{% include_example python/ml/vector_indexer_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import VectorIndexer
+
+data = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
+indexerModel = indexer.fit(data)
+
+# Create new column "indexed" with categorical values transformed to indices
+indexedData = indexerModel.transform(data)
+{% endhighlight %}
</div>
</div>
@@ -569,7 +1160,22 @@ The following example demonstrates how to load a dataset in libsvm format and th
Refer to the [Normalizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Normalizer)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/NormalizerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.Normalizer
+
+val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+
+// Normalize each Vector using $L^1$ norm.
+val normalizer = new Normalizer()
+ .setInputCol("features")
+ .setOutputCol("normFeatures")
+ .setP(1.0)
+val l1NormData = normalizer.transform(dataFrame)
+
+// Normalize each Vector using $L^\infty$ norm.
+val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
+{% endhighlight %}
</div>
<div data-lang="java">
@@ -577,7 +1183,24 @@ for more details on the API.
Refer to the [Normalizer Java docs](api/java/org/apache/spark/ml/feature/Normalizer.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaNormalizerExample.java %}
+{% highlight java %}
+import org.apache.spark.ml.feature.Normalizer;
+import org.apache.spark.sql.DataFrame;
+
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
+
+// Normalize each Vector using $L^1$ norm.
+Normalizer normalizer = new Normalizer()
+ .setInputCol("features")
+ .setOutputCol("normFeatures")
+ .setP(1.0);
+DataFrame l1NormData = normalizer.transform(dataFrame);
+
+// Normalize each Vector using $L^\infty$ norm.
+DataFrame lInfNormData =
+ normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
+{% endhighlight %}
</div>
<div data-lang="python">
@@ -585,7 +1208,19 @@ for more details on the API.
Refer to the [Normalizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Normalizer)
for more details on the API.
-{% include_example python/ml/normalizer_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import Normalizer
+
+dataFrame = sqlContext.read.format("libsvm") \
+    .load("data/mllib/sample_libsvm_data.txt")
+
+# Normalize each Vector using $L^1$ norm.
+normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
+l1NormData = normalizer.transform(dataFrame)
+
+# Normalize each Vector using $L^\infty$ norm.
+lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
+{% endhighlight %}
</div>
</div>
@@ -609,7 +1244,23 @@ The following example demonstrates how to load a dataset in libsvm format and th
Refer to the [StandardScaler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.StandardScaler)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/StandardScalerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.StandardScaler
+
+val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+val scaler = new StandardScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+ .setWithStd(true)
+ .setWithMean(false)
+
+// Compute summary statistics by fitting the StandardScaler
+val scalerModel = scaler.fit(dataFrame)
+
+// Normalize each feature to have unit standard deviation.
+val scaledData = scalerModel.transform(dataFrame)
+{% endhighlight %}
</div>
<div data-lang="java">
@@ -617,7 +1268,25 @@ for more details on the API.
Refer to the [StandardScaler Java docs](api/java/org/apache/spark/ml/feature/StandardScaler.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaStandardScalerExample.java %}
+{% highlight java %}
+import org.apache.spark.ml.feature.StandardScaler;
+import org.apache.spark.ml.feature.StandardScalerModel;
+import org.apache.spark.sql.DataFrame;
+
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
+StandardScaler scaler = new StandardScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+ .setWithStd(true)
+ .setWithMean(false);
+
+// Compute summary statistics by fitting the StandardScaler
+StandardScalerModel scalerModel = scaler.fit(dataFrame);
+
+// Normalize each feature to have unit standard deviation.
+DataFrame scaledData = scalerModel.transform(dataFrame);
+{% endhighlight %}
</div>
<div data-lang="python">
@@ -625,7 +1294,20 @@ for more details on the API.
Refer to the [StandardScaler Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.StandardScaler)
for more details on the API.
-{% include_example python/ml/standard_scaler_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import StandardScaler
+
+dataFrame = sqlContext.read.format("libsvm") \
+    .load("data/mllib/sample_libsvm_data.txt")
+scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
+ withStd=True, withMean=False)
+
+# Compute summary statistics by fitting the StandardScaler
+scalerModel = scaler.fit(dataFrame)
+
+# Normalize each feature to have unit standard deviation.
+scaledData = scalerModel.transform(dataFrame)
+{% endhighlight %}
</div>
</div>
@@ -655,7 +1337,21 @@ Refer to the [MinMaxScaler Scala docs](api/scala/index.html#org.apache.spark.ml.
and the [MinMaxScalerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.MinMaxScaler
+
+val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+val scaler = new MinMaxScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+
+// Compute summary statistics and generate MinMaxScalerModel
+val scalerModel = scaler.fit(dataFrame)
+
+// rescale each feature to range [min, max].
+val scaledData = scalerModel.transform(dataFrame)
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -664,7 +1360,24 @@ Refer to the [MinMaxScaler Java docs](api/java/org/apache/spark/ml/feature/MinMa
and the [MinMaxScalerModel Java docs](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java %}
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.MinMaxScaler;
+import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.sql.DataFrame;
+
+DataFrame dataFrame = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
+MinMaxScaler scaler = new MinMaxScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures");
+
+// Compute summary statistics and generate MinMaxScalerModel
+MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
+
+// rescale each feature to range [min, max].
+DataFrame scaledData = scalerModel.transform(dataFrame);
+{% endhighlight %}
</div>
</div>
@@ -688,7 +1401,23 @@ The following example demonstrates how to bucketize a column of `Double`s into a
Refer to the [Bucketizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Bucketizer)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/BucketizerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.Bucketizer
+import org.apache.spark.sql.DataFrame
+
+val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
+
+val data = Array(-0.5, -0.3, 0.0, 0.2)
+val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
+val bucketizer = new Bucketizer()
+ .setInputCol("features")
+ .setOutputCol("bucketedFeatures")
+ .setSplits(splits)
+
+// Transform original data into its bucket index.
+val bucketedData = bucketizer.transform(dataFrame)
+{% endhighlight %}
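+Each input value maps to the index of the split interval that contains it; a usage sketch for inspecting the result:
+
+{% highlight scala %}
+// Display original values next to their bucket indices.
+bucketedData.show()
+{% endhighlight %}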
</div>
<div data-lang="java">
@@ -696,7 +1425,38 @@ for more details on the API.
Refer to the [Bucketizer Java docs](api/java/org/apache/spark/ml/feature/Bucketizer.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaBucketizerExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.Bucketizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
+
+JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(-0.5),
+ RowFactory.create(-0.3),
+ RowFactory.create(0.0),
+ RowFactory.create(0.2)
+));
+StructType schema = new StructType(new StructField[] {
+ new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
+});
+DataFrame dataFrame = jsql.createDataFrame(data, schema);
+
+Bucketizer bucketizer = new Bucketizer()
+ .setInputCol("features")
+ .setOutputCol("bucketedFeatures")
+ .setSplits(splits);
+
+// Transform original data into its bucket index.
+DataFrame bucketedData = bucketizer.transform(dataFrame);
+{% endhighlight %}
</div>
<div data-lang="python">
@@ -704,7 +1464,19 @@ for more details on the API.
Refer to the [Bucketizer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Bucketizer)
for more details on the API.
-{% include_example python/ml/bucketizer_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import Bucketizer
+
+splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
+
+data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+dataFrame = sqlContext.createDataFrame(data, ["features"])
+
+bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
+
+# Transform original data into its bucket index.
+bucketedData = bucketizer.transform(dataFrame)
+{% endhighlight %}
</div>
</div>
@@ -736,7 +1508,25 @@ This example below demonstrates how to transform vectors using a transforming ve
Refer to the [ElementwiseProduct Scala docs](api/scala/index.html#org.apache.spark.ml.feature.ElementwiseProduct)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/ElementwiseProductExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+
+// Create some vector data; also works for sparse vectors
+val dataFrame = sqlContext.createDataFrame(Seq(
+ ("a", Vectors.dense(1.0, 2.0, 3.0)),
+ ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")
+
+val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
+val transformer = new ElementwiseProduct()
+ .setScalingVec(transformingVector)
+ .setInputCol("vector")
+ .setOutputCol("transformedVector")
+
+// Batch transform the vectors to create new column:
+transformer.transform(dataFrame).show()
+
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -744,7 +1534,41 @@ for more details on the API.
Refer to the [ElementwiseProduct Java docs](api/java/org/apache/spark/ml/feature/ElementwiseProduct.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java %}
+{% highlight java %}
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.ElementwiseProduct;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+// Create some vector data; also works for sparse vectors
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
+ RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
+));
+List<StructField> fields = new ArrayList<StructField>(2);
+fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
+fields.add(DataTypes.createStructField("vector", DataTypes.StringType, false));
+StructType schema = DataTypes.createStructType(fields);
+DataFrame dataFrame = sqlContext.createDataFrame(jrdd, schema);
+Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
+ElementwiseProduct transformer = new ElementwiseProduct()
+ .setScalingVec(transformingVector)
+ .setInputCol("vector")
+ .setOutputCol("transformedVector");
+// Batch transform the vectors to create new column:
+transformer.transform(dataFrame).show();
+
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -752,8 +1576,19 @@ for more details on the API.
Refer to the [ElementwiseProduct Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.ElementwiseProduct)
for more details on the API.
-{% include_example python/ml/elementwise_product_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import ElementwiseProduct
+from pyspark.mllib.linalg import Vectors
+
+data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
+df = sqlContext.createDataFrame(data, ["vector"])
+transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
+ inputCol="vector", outputCol="transformedVector")
+transformer.transform(df).show()
+
+{% endhighlight %}
</div>
+
</div>
## SQLTransformer
@@ -856,7 +1691,19 @@ output column to `features`, after transformation we should get the following Da
Refer to the [VectorAssembler Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.ml.feature.VectorAssembler
+
+val dataset = sqlContext.createDataFrame(
+ Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
+).toDF("id", "hour", "mobile", "userFeatures", "clicked")
+val assembler = new VectorAssembler()
+ .setInputCols(Array("hour", "mobile", "userFeatures"))
+ .setOutputCol("features")
+val output = assembler.transform(dataset)
+println(output.select("features", "clicked").first())
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -864,7 +1711,36 @@ for more details on the API.
Refer to the [VectorAssembler Java docs](api/java/org/apache/spark/ml/feature/VectorAssembler.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+StructType schema = createStructType(new StructField[] {
+ createStructField("id", IntegerType, false),
+ createStructField("hour", IntegerType, false),
+ createStructField("mobile", DoubleType, false),
+ createStructField("userFeatures", new VectorUDT(), false),
+ createStructField("clicked", DoubleType, false)
+});
+Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
+JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
+DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+
+VectorAssembler assembler = new VectorAssembler()
+ .setInputCols(new String[] {"hour", "mobile", "userFeatures"})
+ .setOutputCol("features");
+
+DataFrame output = assembler.transform(dataset);
+System.out.println(output.select("features", "clicked").first());
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -872,7 +1748,19 @@ for more details on the API.
Refer to the [VectorAssembler Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler)
for more details on the API.
-{% include_example python/ml/vector_assembler_example.py %}
+{% highlight python %}
+from pyspark.mllib.linalg import Vectors
+from pyspark.ml.feature import VectorAssembler
+
+dataset = sqlContext.createDataFrame(
+ [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
+ ["id", "hour", "mobile", "userFeatures", "clicked"])
+assembler = VectorAssembler(
+ inputCols=["hour", "mobile", "userFeatures"],
+ outputCol="features")
+output = assembler.transform(dataset)
+print(output.select("features", "clicked").first())
+{% endhighlight %}
</div>
</div>
@@ -1002,7 +1890,33 @@ Suppose also that we have potential input attributes for the `userFeatures`, i
Refer to the [VectorSlicer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSlicer)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/VectorSlicerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
+import org.apache.spark.ml.feature.VectorSlicer
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+
+val data = Array(
+ Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
+ Vectors.dense(-2.0, 2.3, 0.0)
+)
+
+val defaultAttr = NumericAttribute.defaultAttr
+val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
+val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])
+
+val dataRDD = sc.parallelize(data).map(Row.apply)
+val dataset = sqlContext.createDataFrame(dataRDD, StructType(Array(attrGroup.toStructField())))
+
+val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
+
+slicer.setIndices(Array(1)).setNames(Array("f3"))
+// or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
+
+val output = slicer.transform(dataset)
+println(output.select("userFeatures", "features").first())
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -1010,7 +1924,41 @@ for more details on the API.
Refer to the [VectorSlicer Java docs](api/java/org/apache/spark/ml/feature/VectorSlicer.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.attribute.Attribute;
+import org.apache.spark.ml.attribute.AttributeGroup;
+import org.apache.spark.ml.attribute.NumericAttribute;
+import org.apache.spark.ml.feature.VectorSlicer;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+Attribute[] attrs = new Attribute[]{
+ NumericAttribute.defaultAttr().withName("f1"),
+ NumericAttribute.defaultAttr().withName("f2"),
+ NumericAttribute.defaultAttr().withName("f3")
+};
+AttributeGroup group = new AttributeGroup("userFeatures", attrs);
+
+JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+ RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
+ RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
+));
+
+DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));
+
+VectorSlicer vectorSlicer = new VectorSlicer()
+ .setInputCol("userFeatures").setOutputCol("features");
+
+vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
+// or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})
+
+DataFrame output = vectorSlicer.transform(dataset);
+
+System.out.println(output.select("userFeatures", "features").first());
+{% endhighlight %}
</div>
</div>
@@ -1047,7 +1995,21 @@ id | country | hour | clicked | features | label
Refer to the [RFormula Scala docs](api/scala/index.html#org.apache.spark.ml.feature.RFormula)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/RFormulaExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.RFormula
+
+val dataset = sqlContext.createDataFrame(Seq(
+ (7, "US", 18, 1.0),
+ (8, "CA", 12, 0.0),
+ (9, "NZ", 15, 0.0)
+)).toDF("id", "country", "hour", "clicked")
+val formula = new RFormula()
+ .setFormula("clicked ~ country + hour")
+ .setFeaturesCol("features")
+ .setLabelCol("label")
+val output = formula.fit(dataset).transform(dataset)
+output.select("features", "label").show()
+{% endhighlight %}
</div>
<div data-lang="java" markdown="1">
@@ -1055,7 +2017,38 @@ for more details on the API.
Refer to the [RFormula Java docs](api/java/org/apache/spark/ml/feature/RFormula.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaRFormulaExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+StructType schema = createStructType(new StructField[] {
+ createStructField("id", IntegerType, false),
+ createStructField("country", StringType, false),
+ createStructField("hour", IntegerType, false),
+ createStructField("clicked", DoubleType, false)
+});
+JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(7, "US", 18, 1.0),
+ RowFactory.create(8, "CA", 12, 0.0),
+ RowFactory.create(9, "NZ", 15, 0.0)
+));
+DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+
+RFormula formula = new RFormula()
+ .setFormula("clicked ~ country + hour")
+ .setFeaturesCol("features")
+ .setLabelCol("label");
+
+DataFrame output = formula.fit(dataset).transform(dataset);
+output.select("features", "label").show();
+{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
@@ -1063,7 +2056,21 @@ for more details on the API.
Refer to the [RFormula Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula)
for more details on the API.
-{% include_example python/ml/rformula_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import RFormula
+
+dataset = sqlContext.createDataFrame(
+ [(7, "US", 18, 1.0),
+ (8, "CA", 12, 0.0),
+ (9, "NZ", 15, 0.0)],
+ ["id", "country", "hour", "clicked"])
+formula = RFormula(
+ formula="clicked ~ country + hour",
+ featuresCol="features",
+ labelCol="label")
+output = formula.fit(dataset).transform(dataset)
+output.select("features", "label").show()
+{% endhighlight %}
</div>
</div>