author     Xusen Yin <yinxusen@gmail.com>       2015-12-09 12:00:48 -0800
committer  Xiangrui Meng <meng@databricks.com>  2015-12-09 12:00:48 -0800
commit     051c6a066f7b5fcc7472412144c15b50a5319bd5 (patch)
tree       810e743d0bd63ec07818e57d51469238e0998564 /examples
parent     1eb7c22ce72a1b82ed194a51bbcf0da9c771605a (diff)
[SPARK-11551][DOC] Replace example code in ml-features.md using include_example
PR on behalf of somideshmukh, thanks!

Author: Xusen Yin <yinxusen@gmail.com>
Author: somideshmukh <somilde@us.ibm.com>

Closes #10219 from yinxusen/SPARK-11551.
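For context on the mechanism (the markdown side of this change is not shown in this diff, so the snippet below is a sketch based on how Spark's docs use the include_example Jekyll plugin): each hand-copied code block in docs/ml-features.md is replaced by a tag that pulls in a source file and renders only the lines between the // $example on$ and // $example off$ markers visible throughout the files below, e.g.

    {% include_example java/org/apache/spark/examples/ml/JavaBinarizerExample.java %}

The snippets shown in ml-features.md are therefore real, compilable sources in the examples module rather than copies maintained by hand in the markdown.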
Diffstat (limited to 'examples')
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java | 68
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java | 71
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java | 65
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java | 75
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java | 51
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java | 71
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java | 54
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java | 78
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java | 71
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java | 71
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java | 69
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java | 54
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java | 65
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java | 66
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java | 75
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java | 67
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java | 61
-rw-r--r--  examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java | 73
-rw-r--r--  examples/src/main/python/ml/binarizer_example.py | 43
-rw-r--r--  examples/src/main/python/ml/bucketizer_example.py | 43
-rw-r--r--  examples/src/main/python/ml/elementwise_product_example.py | 39
-rw-r--r--  examples/src/main/python/ml/n_gram_example.py | 42
-rw-r--r--  examples/src/main/python/ml/normalizer_example.py | 43
-rw-r--r--  examples/src/main/python/ml/onehot_encoder_example.py | 48
-rw-r--r--  examples/src/main/python/ml/pca_example.py | 42
-rw-r--r--  examples/src/main/python/ml/polynomial_expansion_example.py | 43
-rw-r--r--  examples/src/main/python/ml/rformula_example.py | 44
-rw-r--r--  examples/src/main/python/ml/standard_scaler_example.py | 43
-rw-r--r--  examples/src/main/python/ml/stopwords_remover_example.py | 40
-rw-r--r--  examples/src/main/python/ml/string_indexer_example.py | 39
-rw-r--r--  examples/src/main/python/ml/tokenizer_example.py | 44
-rw-r--r--  examples/src/main/python/ml/vector_assembler_example.py | 42
-rw-r--r--  examples/src/main/python/ml/vector_indexer_example.py | 40
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala | 48
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala | 52
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala | 54
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala | 52
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala | 50
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala | 47
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala | 52
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala | 58
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala | 53
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala | 51
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala | 49
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala | 52
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala | 48
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala | 48
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala | 54
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala | 49
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala | 54
-rw-r--r--  examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala | 58
51 files changed, 2769 insertions(+), 0 deletions(-)
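The new examples can be run directly against a built Spark checkout (a usage sketch, not part of this commit): run-example resolves class names relative to org.apache.spark.examples, so

    ./bin/run-example ml.JavaBinarizerExample

launches the Java Binarizer example, and the same form works for the Scala ones (e.g. ml.BinarizerExample); the Python scripts go through spark-submit, e.g. ./bin/spark-submit examples/src/main/python/ml/binarizer_example.py.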
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
new file mode 100644
index 0000000000..9698cac504
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizerExample.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.Binarizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaBinarizerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaBinarizerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, 0.1),
+ RowFactory.create(1, 0.8),
+ RowFactory.create(2, 0.2)
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
+ });
+ DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);
+ Binarizer binarizer = new Binarizer()
+ .setInputCol("feature")
+ .setOutputCol("binarized_feature")
+ .setThreshold(0.5);
+ DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
+ DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
+ for (Row r : binarizedFeatures.collect()) {
+ Double binarized_value = r.getDouble(0);
+ System.out.println(binarized_value);
+ }
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
new file mode 100644
index 0000000000..8ad369cc93
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizerExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.Bucketizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaBucketizerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaBucketizerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
+
+ JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(-0.5),
+ RowFactory.create(-0.3),
+ RowFactory.create(0.0),
+ RowFactory.create(0.2)
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
+ });
+ DataFrame dataFrame = jsql.createDataFrame(data, schema);
+
+ Bucketizer bucketizer = new Bucketizer()
+ .setInputCol("features")
+ .setOutputCol("bucketedFeatures")
+ .setSplits(splits);
+
+ // Transform original data into its bucket index.
+ DataFrame bucketedData = bucketizer.transform(dataFrame);
+ bucketedData.show();
+ // $example off$
+ jsc.stop();
+ }
+}
+
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
new file mode 100644
index 0000000000..35c0d534a4
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCTExample.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.DCT;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaDCTExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaDCTExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
+ RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
+ RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+ });
+ DataFrame df = jsql.createDataFrame(data, schema);
+ DCT dct = new DCT()
+ .setInputCol("features")
+ .setOutputCol("featuresDCT")
+ .setInverse(false);
+ DataFrame dctDf = dct.transform(df);
+ dctDf.select("featuresDCT").show(3);
+ // $example off$
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java
new file mode 100644
index 0000000000..2898accec6
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProductExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.ElementwiseProduct;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaElementwiseProductExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaElementwiseProductExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ // Create some vector data; also works for sparse vectors
+ JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
+ RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
+ ));
+
+ List<StructField> fields = new ArrayList<StructField>(2);
+ fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
+ fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));
+
+ StructType schema = DataTypes.createStructType(fields);
+
+ DataFrame dataFrame = sqlContext.createDataFrame(jrdd, schema);
+
+ Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
+
+ ElementwiseProduct transformer = new ElementwiseProduct()
+ .setScalingVec(transformingVector)
+ .setInputCol("vector")
+ .setOutputCol("transformedVector");
+
+ // Batch transform the vectors to create new column:
+ transformer.transform(dataFrame).show();
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
new file mode 100644
index 0000000000..2d50ba7faa
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinMaxScalerExample.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import org.apache.spark.ml.feature.MinMaxScaler;
+import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+public class JavaMinMaxScalerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JaveMinMaxScalerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+ MinMaxScaler scaler = new MinMaxScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures");
+
+ // Compute summary statistics and generate MinMaxScalerModel
+ MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
+
+ // rescale each feature to range [min, max].
+ DataFrame scaledData = scalerModel.transform(dataFrame);
+ scaledData.show();
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
new file mode 100644
index 0000000000..8fd75ed8b5
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGramExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.NGram;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaNGramExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaNGramExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
+ RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
+ RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField(
+ "words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+ });
+
+ DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
+
+ NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+
+ DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame);
+
+ for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
+ java.util.List<String> ngrams = r.getList(0);
+ for (String ngram : ngrams) System.out.print(ngram + " --- ");
+ System.out.println();
+ }
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
new file mode 100644
index 0000000000..ed3f6163c0
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNormalizerExample.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import org.apache.spark.ml.feature.Normalizer;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+public class JavaNormalizerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaNormalizerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+
+ // Normalize each Vector using $L^1$ norm.
+ Normalizer normalizer = new Normalizer()
+ .setInputCol("features")
+ .setOutputCol("normFeatures")
+ .setP(1.0);
+
+ DataFrame l1NormData = normalizer.transform(dataFrame);
+ l1NormData.show();
+
+ // Normalize each Vector using $L^\infty$ norm.
+ DataFrame lInfNormData =
+ normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
+ lInfNormData.show();
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
new file mode 100644
index 0000000000..bc50960708
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoderExample.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.OneHotEncoder;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.ml.feature.StringIndexerModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaOneHotEncoderExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaOneHotEncoderExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "a"),
+ RowFactory.create(1, "b"),
+ RowFactory.create(2, "c"),
+ RowFactory.create(3, "a"),
+ RowFactory.create(4, "a"),
+ RowFactory.create(5, "c")
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("category", DataTypes.StringType, false, Metadata.empty())
+ });
+
+ DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+
+ StringIndexerModel indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+ .fit(df);
+ DataFrame indexed = indexer.transform(df);
+
+ OneHotEncoder encoder = new OneHotEncoder()
+ .setInputCol("categoryIndex")
+ .setOutputCol("categoryVec");
+ DataFrame encoded = encoder.transform(indexed);
+ encoded.select("id", "categoryVec").show();
+ // $example off$
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
new file mode 100644
index 0000000000..8282fab084
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.PCA;
+import org.apache.spark.ml.feature.PCAModel;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaPCAExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaPCAExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
+ RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
+ RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+ });
+
+ DataFrame df = jsql.createDataFrame(data, schema);
+
+ PCAModel pca = new PCA()
+ .setInputCol("features")
+ .setOutputCol("pcaFeatures")
+ .setK(3)
+ .fit(df);
+
+ DataFrame result = pca.transform(df).select("pcaFeatures");
+ result.show();
+ // $example off$
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
new file mode 100644
index 0000000000..668f71e640
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansionExample.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.PolynomialExpansion;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaPolynomialExpansionExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansionExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ PolynomialExpansion polyExpansion = new PolynomialExpansion()
+ .setInputCol("features")
+ .setOutputCol("polyFeatures")
+ .setDegree(3);
+
+ JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.dense(-2.0, 2.3)),
+ RowFactory.create(Vectors.dense(0.0, 0.0)),
+ RowFactory.create(Vectors.dense(0.6, -1.1))
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+ });
+
+ DataFrame df = jsql.createDataFrame(data, schema);
+ DataFrame polyDF = polyExpansion.transform(df);
+
+ Row[] row = polyDF.select("polyFeatures").take(3);
+ for (Row r : row) {
+ System.out.println(r.get(0));
+ }
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
new file mode 100644
index 0000000000..1e1062b541
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormulaExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+public class JavaRFormulaExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaRFormulaExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ StructType schema = createStructType(new StructField[]{
+ createStructField("id", IntegerType, false),
+ createStructField("country", StringType, false),
+ createStructField("hour", IntegerType, false),
+ createStructField("clicked", DoubleType, false)
+ });
+
+ JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(7, "US", 18, 1.0),
+ RowFactory.create(8, "CA", 12, 0.0),
+ RowFactory.create(9, "NZ", 15, 0.0)
+ ));
+
+ DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+ RFormula formula = new RFormula()
+ .setFormula("clicked ~ country + hour")
+ .setFeaturesCol("features")
+ .setLabelCol("label");
+ DataFrame output = formula.fit(dataset).transform(dataset);
+ output.select("features", "label").show();
+ // $example off$
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
new file mode 100644
index 0000000000..da4756643f
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStandardScalerExample.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import org.apache.spark.ml.feature.StandardScaler;
+import org.apache.spark.ml.feature.StandardScalerModel;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+public class JavaStandardScalerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaStandardScalerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+
+ StandardScaler scaler = new StandardScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+ .setWithStd(true)
+ .setWithMean(false);
+
+ // Compute summary statistics by fitting the StandardScaler
+ StandardScalerModel scalerModel = scaler.fit(dataFrame);
+
+ // Normalize each feature to have unit standard deviation.
+ DataFrame scaledData = scalerModel.transform(dataFrame);
+ scaledData.show();
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
new file mode 100644
index 0000000000..b6b201c6b6
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemoverExample.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StopWordsRemover;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaStopWordsRemoverExample {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemoverExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ StopWordsRemover remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered");
+
+ JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+ RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField(
+ "raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+ });
+
+ DataFrame dataset = jsql.createDataFrame(rdd, schema);
+ remover.transform(dataset).show();
+ // $example off$
+ jsc.stop();
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
new file mode 100644
index 0000000000..05d12c1e70
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexerExample.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+public class JavaStringIndexerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaStringIndexerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "a"),
+ RowFactory.create(1, "b"),
+ RowFactory.create(2, "c"),
+ RowFactory.create(3, "a"),
+ RowFactory.create(4, "a"),
+ RowFactory.create(5, "c")
+ ));
+ StructType schema = new StructType(new StructField[]{
+ createStructField("id", IntegerType, false),
+ createStructField("category", StringType, false)
+ });
+ DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+ StringIndexer indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex");
+ DataFrame indexed = indexer.fit(df).transform(df);
+ indexed.show();
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
new file mode 100644
index 0000000000..617dc3f66e
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizerExample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RegexTokenizer;
+import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+// $example off$
+
+public class JavaTokenizerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaTokenizerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "Hi I heard about Spark"),
+ RowFactory.create(1, "I wish Java could use case classes"),
+ RowFactory.create(2, "Logistic,regression,models,are,neat")
+ ));
+
+ StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
+ new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
+ });
+
+ DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
+
+ Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
+
+ DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
+ for (Row r : wordsDataFrame.select("words", "label").take(3)) {
+ java.util.List<String> words = r.getList(0);
+ for (String word : words) System.out.print(word + " ");
+ System.out.println();
+ }
+
+ RegexTokenizer regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+ // $example off$
+ jsc.stop();
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
new file mode 100644
index 0000000000..7e230b5897
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssemblerExample.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+public class JavaVectorAssemblerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaVectorAssemblerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // $example on$
+ StructType schema = createStructType(new StructField[]{
+ createStructField("id", IntegerType, false),
+ createStructField("hour", IntegerType, false),
+ createStructField("mobile", DoubleType, false),
+ createStructField("userFeatures", new VectorUDT(), false),
+ createStructField("clicked", DoubleType, false)
+ });
+ Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
+ JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
+ DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+
+ VectorAssembler assembler = new VectorAssembler()
+ .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
+ .setOutputCol("features");
+
+ DataFrame output = assembler.transform(dataset);
+ System.out.println(output.select("features", "clicked").first());
+ // $example off$
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
new file mode 100644
index 0000000000..545758e31d
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorIndexerExample.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import java.util.Map;
+
+import org.apache.spark.ml.feature.VectorIndexer;
+import org.apache.spark.ml.feature.VectorIndexerModel;
+import org.apache.spark.sql.DataFrame;
+// $example off$
+
+public class JavaVectorIndexerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaVectorIndexerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+
+ VectorIndexer indexer = new VectorIndexer()
+ .setInputCol("features")
+ .setOutputCol("indexed")
+ .setMaxCategories(10);
+ VectorIndexerModel indexerModel = indexer.fit(data);
+
+ Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
+ System.out.print("Chose " + categoryMaps.size() + " categorical features:");
+
+ for (Integer feature : categoryMaps.keySet()) {
+ System.out.print(" " + feature);
+ }
+ System.out.println();
+
+ // Create new column "indexed" with categorical values transformed to indices
+ DataFrame indexedData = indexerModel.transform(data);
+ indexedData.show();
+ // $example off$
+ jsc.stop();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
new file mode 100644
index 0000000000..4d5cb04ff5
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicerExample.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+// $example on$
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.attribute.Attribute;
+import org.apache.spark.ml.attribute.AttributeGroup;
+import org.apache.spark.ml.attribute.NumericAttribute;
+import org.apache.spark.ml.feature.VectorSlicer;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+public class JavaVectorSlicerExample {
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaVectorSlicerExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ // $example on$
+ Attribute[] attrs = new Attribute[]{
+ NumericAttribute.defaultAttr().withName("f1"),
+ NumericAttribute.defaultAttr().withName("f2"),
+ NumericAttribute.defaultAttr().withName("f3")
+ };
+ AttributeGroup group = new AttributeGroup("userFeatures", attrs);
+
+ JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+ RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
+ RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
+ ));
+
+ DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));
+
+ VectorSlicer vectorSlicer = new VectorSlicer()
+ .setInputCol("userFeatures").setOutputCol("features");
+
+ vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
+ // or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})
+
+ DataFrame output = vectorSlicer.transform(dataset);
+
+ System.out.println(output.select("userFeatures", "features").first());
+ // $example off$
+ jsc.stop();
+ }
+}
+
diff --git a/examples/src/main/python/ml/binarizer_example.py b/examples/src/main/python/ml/binarizer_example.py
new file mode 100644
index 0000000000..317cfa638a
--- /dev/null
+++ b/examples/src/main/python/ml/binarizer_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Binarizer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="BinarizerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ continuousDataFrame = sqlContext.createDataFrame([
+ (0, 0.1),
+ (1, 0.8),
+ (2, 0.2)
+ ], ["label", "feature"])
+ binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
+ binarizedDataFrame = binarizer.transform(continuousDataFrame)
+ binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+ for binarized_feature, in binarizedFeatures.collect():
+ print(binarized_feature)
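+    # Illustrative: with threshold=0.5 only 0.8 exceeds the threshold, so the
+    # printed values should be 0.0, 1.0, 0.0.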
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/bucketizer_example.py b/examples/src/main/python/ml/bucketizer_example.py
new file mode 100644
index 0000000000..4304255f35
--- /dev/null
+++ b/examples/src/main/python/ml/bucketizer_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Bucketizer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="BucketizerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
+
+ data = [(-0.5,), (-0.3,), (0.0,), (0.2,)]
+ dataFrame = sqlContext.createDataFrame(data, ["features"])
+
+ bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")
+
+ # Transform original data into its bucket index.
+ bucketedData = bucketizer.transform(dataFrame)
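+    # Illustrative: buckets are half-open on the right, so these values should
+    # map to bucket indices 1.0, 1.0, 2.0, 2.0.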
+ bucketedData.show()
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py
new file mode 100644
index 0000000000..c85cb0d895
--- /dev/null
+++ b/examples/src/main/python/ml/elementwise_product_example.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import ElementwiseProduct
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="ElementwiseProductExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
+ df = sqlContext.createDataFrame(data, ["vector"])
+ transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
+ inputCol="vector", outputCol="transformedVector")
+ transformer.transform(df).show()
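+    # Illustrative: each vector is multiplied element-wise by [0.0, 1.0, 2.0],
+    # so [1.0, 2.0, 3.0] becomes [0.0, 2.0, 6.0].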
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/n_gram_example.py b/examples/src/main/python/ml/n_gram_example.py
new file mode 100644
index 0000000000..f2d85f53e7
--- /dev/null
+++ b/examples/src/main/python/ml/n_gram_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import NGram
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="NGramExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ wordDataFrame = sqlContext.createDataFrame([
+ (0, ["Hi", "I", "heard", "about", "Spark"]),
+ (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
+ (2, ["Logistic", "regression", "models", "are", "neat"])
+ ], ["label", "words"])
+ ngram = NGram(inputCol="words", outputCol="ngrams")
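+    # NGram defaults to n=2, so the "ngrams" column will contain bigrams.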
+ ngramDataFrame = ngram.transform(wordDataFrame)
+ for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
+ print(ngrams_label)
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/normalizer_example.py b/examples/src/main/python/ml/normalizer_example.py
new file mode 100644
index 0000000000..d490221474
--- /dev/null
+++ b/examples/src/main/python/ml/normalizer_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Normalizer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="NormalizerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
+ # Normalize each Vector using $L^1$ norm.
+ normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
+ l1NormData = normalizer.transform(dataFrame)
+ l1NormData.show()
+
+ # Normalize each Vector using $L^\infty$ norm.
+ lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
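+    # The params dict {normalizer.p: float("inf")} overrides p for this single
+    # call only; the normalizer's own p stays 1.0.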
+ lInfNormData.show()
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/onehot_encoder_example.py b/examples/src/main/python/ml/onehot_encoder_example.py
new file mode 100644
index 0000000000..0f94c26638
--- /dev/null
+++ b/examples/src/main/python/ml/onehot_encoder_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import OneHotEncoder, StringIndexer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="OneHotEncoderExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ df = sqlContext.createDataFrame([
+ (0, "a"),
+ (1, "b"),
+ (2, "c"),
+ (3, "a"),
+ (4, "a"),
+ (5, "c")
+ ], ["id", "category"])
+
+ stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+ model = stringIndexer.fit(df)
+ indexed = model.transform(df)
+ encoder = OneHotEncoder(dropLast=False, inputCol="categoryIndex", outputCol="categoryVec")
+ encoded = encoder.transform(indexed)
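+    # Illustrative: "a" is the most frequent category and gets index 0.0, so
+    # with dropLast=False it encodes to the sparse vector (3,[0],[1.0]).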
+ encoded.select("id", "categoryVec").show()
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/pca_example.py b/examples/src/main/python/ml/pca_example.py
new file mode 100644
index 0000000000..a17181f1b8
--- /dev/null
+++ b/examples/src/main/python/ml/pca_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import PCA
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="PCAExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+ (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+ (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+ df = sqlContext.createDataFrame(data, ["features"])
+ pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
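+    # k=3 projects the 5-dimensional input vectors onto their top 3 principal
+    # components.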
+ model = pca.fit(df)
+ result = model.transform(df).select("pcaFeatures")
+ result.show(truncate=False)
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py
new file mode 100644
index 0000000000..3d4fafd1a4
--- /dev/null
+++ b/examples/src/main/python/ml/polynomial_expansion_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import PolynomialExpansion
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="PolynomialExpansionExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ df = sqlContext\
+ .createDataFrame([(Vectors.dense([-2.0, 2.3]), ),
+ (Vectors.dense([0.0, 0.0]), ),
+ (Vectors.dense([0.6, -1.1]), )],
+ ["features"])
+ px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
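+    # Illustrative: degree=2 expands [x, y] into [x, x^2, y, x*y, y^2], so
+    # [-2.0, 2.3] should become [-2.0, 4.0, 2.3, -4.6, 5.29].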
+ polyDF = px.transform(df)
+ for expanded in polyDF.select("polyFeatures").take(3):
+ print(expanded)
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/rformula_example.py b/examples/src/main/python/ml/rformula_example.py
new file mode 100644
index 0000000000..b544a14700
--- /dev/null
+++ b/examples/src/main/python/ml/rformula_example.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import RFormula
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="RFormulaExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ dataset = sqlContext.createDataFrame(
+ [(7, "US", 18, 1.0),
+ (8, "CA", 12, 0.0),
+ (9, "NZ", 15, 0.0)],
+ ["id", "country", "hour", "clicked"])
+ formula = RFormula(
+ formula="clicked ~ country + hour",
+ featuresCol="features",
+ labelCol="label")
+ output = formula.fit(dataset).transform(dataset)
+ output.select("features", "label").show()
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/standard_scaler_example.py b/examples/src/main/python/ml/standard_scaler_example.py
new file mode 100644
index 0000000000..ae7aa85005
--- /dev/null
+++ b/examples/src/main/python/ml/standard_scaler_example.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StandardScaler
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="StandardScalerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
+ withStd=True, withMean=False)
+
+ # Compute summary statistics by fitting the StandardScaler
+ scalerModel = scaler.fit(dataFrame)
+
+ # Normalize each feature to have unit standard deviation.
+ scaledData = scalerModel.transform(dataFrame)
+ scaledData.show()
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/stopwords_remover_example.py b/examples/src/main/python/ml/stopwords_remover_example.py
new file mode 100644
index 0000000000..01f94af8ca
--- /dev/null
+++ b/examples/src/main/python/ml/stopwords_remover_example.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StopWordsRemover
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="StopWordsRemoverExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ sentenceData = sqlContext.createDataFrame([
+        (0, ["I", "saw", "the", "red", "balloon"]),
+ (1, ["Mary", "had", "a", "little", "lamb"])
+ ], ["label", "raw"])
+
+ remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
+ remover.transform(sentenceData).show(truncate=False)
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/string_indexer_example.py b/examples/src/main/python/ml/string_indexer_example.py
new file mode 100644
index 0000000000..58a8cb5d56
--- /dev/null
+++ b/examples/src/main/python/ml/string_indexer_example.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import StringIndexer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="StringIndexerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ df = sqlContext.createDataFrame(
+ [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
+ ["id", "category"])
+ indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
+ indexed = indexer.fit(df).transform(df)
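+    # Illustrative: labels are indexed by descending frequency, so
+    # "a" -> 0.0, "c" -> 1.0, "b" -> 2.0.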
+ indexed.show()
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/tokenizer_example.py b/examples/src/main/python/ml/tokenizer_example.py
new file mode 100644
index 0000000000..ce9b225be5
--- /dev/null
+++ b/examples/src/main/python/ml/tokenizer_example.py
@@ -0,0 +1,44 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import Tokenizer, RegexTokenizer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="TokenizerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ sentenceDataFrame = sqlContext.createDataFrame([
+ (0, "Hi I heard about Spark"),
+ (1, "I wish Java could use case classes"),
+ (2, "Logistic,regression,models,are,neat")
+ ], ["label", "sentence"])
+ tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
+ wordsDataFrame = tokenizer.transform(sentenceDataFrame)
+ for words_label in wordsDataFrame.select("words", "label").take(3):
+ print(words_label)
+ regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
+    # alternatively, pattern="\\w+", gaps=False
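+
+    # A sketch mirroring the Scala TokenizerExample added in this patch: apply
+    # the regex tokenizer too, so the comma-separated sentence is also split
+    # on non-word characters.
+    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+    for words_label in regexTokenized.select("words", "label").take(3):
+        print(words_label)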
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/vector_assembler_example.py b/examples/src/main/python/ml/vector_assembler_example.py
new file mode 100644
index 0000000000..04f64839f1
--- /dev/null
+++ b/examples/src/main/python/ml/vector_assembler_example.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.mllib.linalg import Vectors
+from pyspark.ml.feature import VectorAssembler
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="VectorAssemblerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ dataset = sqlContext.createDataFrame(
+ [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
+ ["id", "hour", "mobile", "userFeatures", "clicked"])
+ assembler = VectorAssembler(
+ inputCols=["hour", "mobile", "userFeatures"],
+ outputCol="features")
+ output = assembler.transform(dataset)
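+    # Illustrative: the assembled "features" vector should be
+    # [18.0, 1.0, 0.0, 10.0, 0.5].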
+ print(output.select("features", "clicked").first())
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/python/ml/vector_indexer_example.py b/examples/src/main/python/ml/vector_indexer_example.py
new file mode 100644
index 0000000000..146f41c1dd
--- /dev/null
+++ b/examples/src/main/python/ml/vector_indexer_example.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+from pyspark.sql import SQLContext
+# $example on$
+from pyspark.ml.feature import VectorIndexer
+# $example off$
+
+if __name__ == "__main__":
+ sc = SparkContext(appName="VectorIndexerExample")
+ sqlContext = SQLContext(sc)
+
+ # $example on$
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+ indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
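+    # Features with at most 10 distinct values are treated as categorical and
+    # re-indexed; all other features are left as continuous.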
+ indexerModel = indexer.fit(data)
+
+    # Create a new column "indexed" with categorical values transformed to indices
+ indexedData = indexerModel.transform(data)
+ indexedData.show()
+ # $example off$
+
+ sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
new file mode 100644
index 0000000000..e724aa5872
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Binarizer
+// $example off$
+import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.{SparkConf, SparkContext}
+
+object BinarizerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("BinarizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+ // $example on$
+ val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
+ val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")
+
+ val binarizer: Binarizer = new Binarizer()
+ .setInputCol("feature")
+ .setOutputCol("binarized_feature")
+ .setThreshold(0.5)
+
+ val binarizedDataFrame = binarizer.transform(dataFrame)
+ val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+ binarizedFeatures.collect().foreach(println)
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
new file mode 100644
index 0000000000..7c75e3d72b
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Bucketizer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object BucketizerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("BucketizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
+
+ val data = Array(-0.5, -0.3, 0.0, 0.2)
+ val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
+ val bucketizer = new Bucketizer()
+ .setInputCol("features")
+ .setOutputCol("bucketedFeatures")
+ .setSplits(splits)
+
+ // Transform original data into its bucket index.
+ val bucketedData = bucketizer.transform(dataFrame)
+ bucketedData.show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
new file mode 100644
index 0000000000..314c2c28a2
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.DCT
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object DCTExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("DCTExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val data = Seq(
+ Vectors.dense(0.0, 1.0, -2.0, 3.0),
+ Vectors.dense(-1.0, 2.0, 4.0, -7.0),
+ Vectors.dense(14.0, -2.0, -5.0, 1.0))
+
+ val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
+ val dct = new DCT()
+ .setInputCol("features")
+ .setOutputCol("featuresDCT")
+ .setInverse(false)
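+    // setInverse(false) applies the forward DCT-II, scaled so that the
+    // representing matrix of the transform is unitary.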
+
+ val dctDf = dct.transform(df)
+ dctDf.select("featuresDCT").show(3)
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
new file mode 100644
index 0000000000..872de51dc7
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object ElementwiseProductExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("ElementwiseProductExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ // Create some vector data; also works for sparse vectors
+ val dataFrame = sqlContext.createDataFrame(Seq(
+ ("a", Vectors.dense(1.0, 2.0, 3.0)),
+ ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")
+
+ val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
+ val transformer = new ElementwiseProduct()
+ .setScalingVec(transformingVector)
+ .setInputCol("vector")
+ .setOutputCol("transformedVector")
+
+    // Batch transform the vectors to create a new column:
+ transformer.transform(dataFrame).show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
new file mode 100644
index 0000000000..fb7f28c988
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.MinMaxScaler
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object MinMaxScalerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("MinMaxScalerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
+ val scaler = new MinMaxScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+
+ // Compute summary statistics and generate MinMaxScalerModel
+ val scalerModel = scaler.fit(dataFrame)
+
+    // Rescale each feature to the range [min, max] (default [0.0, 1.0]).
+ val scaledData = scalerModel.transform(dataFrame)
+ scaledData.show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
new file mode 100644
index 0000000000..8a85f71b56
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.NGram
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object NGramExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("NGramExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val wordDataFrame = sqlContext.createDataFrame(Seq(
+ (0, Array("Hi", "I", "heard", "about", "Spark")),
+ (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
+ (2, Array("Logistic", "regression", "models", "are", "neat"))
+ )).toDF("label", "words")
+
+ val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
+ val ngramDataFrame = ngram.transform(wordDataFrame)
+    ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
new file mode 100644
index 0000000000..1990b55e8c
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.Normalizer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object NormalizerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("NormalizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
+ // Normalize each Vector using $L^1$ norm.
+ val normalizer = new Normalizer()
+ .setInputCol("features")
+ .setOutputCol("normFeatures")
+ .setP(1.0)
+
+ val l1NormData = normalizer.transform(dataFrame)
+ l1NormData.show()
+
+ // Normalize each Vector using $L^\infty$ norm.
+ val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
+ lInfNormData.show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
new file mode 100644
index 0000000000..66602e2118
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object OneHotEncoderExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("OneHotEncoderExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val df = sqlContext.createDataFrame(Seq(
+ (0, "a"),
+ (1, "b"),
+ (2, "c"),
+ (3, "a"),
+ (4, "a"),
+ (5, "c")
+ )).toDF("id", "category")
+
+ val indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+ .fit(df)
+ val indexed = indexer.transform(df)
+
+ val encoder = new OneHotEncoder()
+ .setInputCol("categoryIndex")
+ .setOutputCol("categoryVec")
+ val encoded = encoder.transform(indexed)
+ encoded.select("id", "categoryVec").show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
new file mode 100644
index 0000000000..4c806f71a3
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object PCAExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("PCAExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val data = Array(
+ Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+ Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+ Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+ )
+ val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+ val pca = new PCA()
+ .setInputCol("features")
+ .setOutputCol("pcaFeatures")
+ .setK(3)
+ .fit(df)
+ val pcaDF = pca.transform(df)
+ val result = pcaDF.select("pcaFeatures")
+ result.show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
new file mode 100644
index 0000000000..39fb79af35
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.PolynomialExpansion
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object PolynomialExpansionExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("PolynomialExpansionExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val data = Array(
+ Vectors.dense(-2.0, 2.3),
+ Vectors.dense(0.0, 0.0),
+ Vectors.dense(0.6, -1.1)
+ )
+ val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+ val polynomialExpansion = new PolynomialExpansion()
+ .setInputCol("features")
+ .setOutputCol("polyFeatures")
+ .setDegree(3)
+ val polyDF = polynomialExpansion.transform(df)
+ polyDF.select("polyFeatures").take(3).foreach(println)
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
new file mode 100644
index 0000000000..286866edea
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.RFormula
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object RFormulaExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("RFormulaExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val dataset = sqlContext.createDataFrame(Seq(
+ (7, "US", 18, 1.0),
+ (8, "CA", 12, 0.0),
+ (9, "NZ", 15, 0.0)
+ )).toDF("id", "country", "hour", "clicked")
+ val formula = new RFormula()
+ .setFormula("clicked ~ country + hour")
+ .setFeaturesCol("features")
+ .setLabelCol("label")
+ val output = formula.fit(dataset).transform(dataset)
+ output.select("features", "label").show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
new file mode 100644
index 0000000000..e0a41e383a
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StandardScaler
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StandardScalerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("StandardScalerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val dataFrame = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
+ val scaler = new StandardScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+ .setWithStd(true)
+ .setWithMean(false)
+
+ // Compute summary statistics by fitting the StandardScaler.
+ val scalerModel = scaler.fit(dataFrame)
+
+ // Normalize each feature to have unit standard deviation.
+ val scaledData = scalerModel.transform(dataFrame)
+ scaledData.show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
new file mode 100644
index 0000000000..655ffce08d
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StopWordsRemover
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StopWordsRemoverExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("StopWordsRemoverExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+
+ val dataSet = sqlContext.createDataFrame(Seq(
+      (0, Seq("I", "saw", "the", "red", "balloon")),
+ (1, Seq("Mary", "had", "a", "little", "lamb"))
+ )).toDF("id", "raw")
+
+ remover.transform(dataSet).show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
new file mode 100644
index 0000000000..9fa494cd24
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.StringIndexer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object StringIndexerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("StringIndexerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val df = sqlContext.createDataFrame(
+ Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+ ).toDF("id", "category")
+
+ val indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+
+ val indexed = indexer.fit(df).transform(df)
+ indexed.show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
new file mode 100644
index 0000000000..01e0d1388a
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object TokenizerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("TokenizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+ (0, "Hi I heard about Spark"),
+ (1, "I wish Java could use case classes"),
+ (2, "Logistic,regression,models,are,neat")
+ )).toDF("label", "sentence")
+
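+ // Tokenizer lowercases the input and splits it on whitespace; RegexTokenizer splits
+ // on a regex, and with the default gaps=true the pattern matches separators, not tokens.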
+ val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
+ val regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+ val tokenized = tokenizer.transform(sentenceDataFrame)
+ tokenized.select("words", "label").take(3).foreach(println)
+ val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+ regexTokenized.select("words", "label").take(3).foreach(println)
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
new file mode 100644
index 0000000000..d527924419
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object VectorAssemblerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("VectorAssemblerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val dataset = sqlContext.createDataFrame(
+ Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
+ ).toDF("id", "hour", "mobile", "userFeatures", "clicked")
+
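+ // VectorAssembler concatenates the input columns into a single vector column,
+ // flattening vector inputs: hour, mobile, and the 3-element userFeatures
+ // become one 5-element "features" vector.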
+ val assembler = new VectorAssembler()
+ .setInputCols(Array("hour", "mobile", "userFeatures"))
+ .setOutputCol("features")
+
+ val output = assembler.transform(dataset)
+ println(output.select("features", "clicked").first())
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
new file mode 100644
index 0000000000..685891c164
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.VectorIndexer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object VectorIndexerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("VectorIndexerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+
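+ // Any feature with at most maxCategories distinct values is treated as categorical
+ // and re-indexed; features with more distinct values are left as continuous.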
+ val indexer = new VectorIndexer()
+ .setInputCol("features")
+ .setOutputCol("indexed")
+ .setMaxCategories(10)
+
+ val indexerModel = indexer.fit(data)
+
+ val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
+ println(s"Chose ${categoricalFeatures.size} categorical features: " +
+ categoricalFeatures.mkString(", "))
+
+ // Create a new column "indexed" with the categorical values transformed to indices
+ val indexedData = indexerModel.transform(data)
+ indexedData.show()
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
new file mode 100644
index 0000000000..04f19829ef
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSlicerExample.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
+import org.apache.spark.ml.feature.VectorSlicer
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.StructType
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+object VectorSlicerExample {
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("VectorSlicerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // $example on$
+ val data = Array(Row(Vectors.dense(-2.0, 2.3, 0.0)))
+
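+ // Attach ML attribute names ("f1", "f2", "f3") to the vector's slots so that
+ // VectorSlicer can select features by name as well as by index.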
+ val defaultAttr = NumericAttribute.defaultAttr
+ val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
+ val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])
+
+ val dataRDD = sc.parallelize(data)
+ val dataset = sqlContext.createDataFrame(dataRDD, StructType(Array(attrGroup.toStructField())))
+
+ val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")
+
+ slicer.setIndices(Array(1)).setNames(Array("f3"))
+ // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))
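+ // The output vector holds the features selected by index ("f2") followed by
+ // those selected by name ("f3").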
+
+ val output = slicer.transform(dataset)
+ println(output.select("userFeatures", "features").first())
+ // $example off$
+ sc.stop()
+ }
+}
+// scalastyle:on println