15 files changed, 861 insertions, 361 deletions
diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 7796bac697..7a97285032 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -58,46 +58,7 @@ Each record could be an iterable of strings or other types.
 
 Refer to the [`HashingTF` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.HashingTF) for details on the API.
 
-
-{% highlight scala %}
-import org.apache.spark.rdd.RDD
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.feature.HashingTF
-import org.apache.spark.mllib.linalg.Vector
-
-val sc: SparkContext = ...
-
-// Load documents (one per line).
-val documents: RDD[Seq[String]] = sc.textFile("...").map(_.split(" ").toSeq)
-
-val hashingTF = new HashingTF()
-val tf: RDD[Vector] = hashingTF.transform(documents)
-{% endhighlight %}
-
-While applying `HashingTF` only needs a single pass to the data, applying `IDF` needs two passes: 
-first to compute the IDF vector and second to scale the term frequencies by IDF.
-
-{% highlight scala %}
-import org.apache.spark.mllib.feature.IDF
-
-// ... continue from the previous example
-tf.cache()
-val idf = new IDF().fit(tf)
-val tfidf: RDD[Vector] = idf.transform(tf)
-{% endhighlight %}
-
-`spark.mllib`'s IDF implementation provides an option for ignoring terms which occur in less than a
-minimum number of documents.  In such cases, the IDF for these terms is set to 0.  This feature
-can be used by passing the `minDocFreq` value to the IDF constructor.
-
-{% highlight scala %}
-import org.apache.spark.mllib.feature.IDF
-
-// ... continue from the previous example
-tf.cache()
-val idf = new IDF(minDocFreq = 2).fit(tf)
-val tfidf: RDD[Vector] = idf.transform(tf)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/TFIDFExample.scala %}
 </div>
 <div data-lang="python" markdown="1">
 
@@ -109,41 +70,7 @@ Each record could be an iterable of strings or other types.
 
 Refer to the [`HashingTF` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.HashingTF) for details on the API.
 
-{% highlight python %}
-from pyspark import SparkContext
-from pyspark.mllib.feature import HashingTF
-
-sc = SparkContext()
-
-# Load documents (one per line).
-documents = sc.textFile("...").map(lambda line: line.split(" "))
-
-hashingTF = HashingTF()
-tf = hashingTF.transform(documents)
-{% endhighlight %}
-
-While applying `HashingTF` only needs a single pass to the data, applying `IDF` needs two passes: 
-first to compute the IDF vector and second to scale the term frequencies by IDF.
-
-{% highlight python %}
-from pyspark.mllib.feature import IDF
-
-# ... continue from the previous example
-tf.cache()
-idf = IDF().fit(tf)
-tfidf = idf.transform(tf)
-{% endhighlight %}
-
-`spark.mllib`'s IDF implementation provides an option for ignoring terms which occur in less than a
-minimum number of documents.  In such cases, the IDF for these terms is set to 0.  This feature
-can be used by passing the `minDocFreq` value to the IDF constructor.
-
-{% highlight python %}
-# ... continue from the previous example
-tf.cache()
-idf = IDF(minDocFreq=2).fit(tf)
-tfidf = idf.transform(tf)
-{% endhighlight %}
+{% include_example python/mllib/tf_idf_example.py %}
 </div>
 </div>
 
@@ -192,47 +119,12 @@ Here we assume the extracted file is `text8` and in same directory as you run th
 <div data-lang="scala" markdown="1">
 Refer to the [`Word2Vec` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.Word2Vec) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark._
-import org.apache.spark.rdd._
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
-
-val input = sc.textFile("text8").map(line => line.split(" ").toSeq)
-
-val word2vec = new Word2Vec()
-
-val model = word2vec.fit(input)
-
-val synonyms = model.findSynonyms("china", 40)
-
-for((synonym, cosineSimilarity) <- synonyms) {
-  println(s"$synonym $cosineSimilarity")
-}
-
-// Save and load model
-model.save(sc, "myModelPath")
-val sameModel = Word2VecModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/Word2VecExample.scala %}
 </div>
 <div data-lang="python" markdown="1">
 Refer to the [`Word2Vec` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.Word2Vec) for more details on the API.
 
-{% highlight python %}
-from pyspark import SparkContext
-from pyspark.mllib.feature import Word2Vec
-
-sc = SparkContext(appName='Word2Vec')
-inp = sc.textFile("text8_lines").map(lambda row: row.split(" "))
-
-word2vec = Word2Vec()
-model = word2vec.fit(inp)
-
-synonyms = model.findSynonyms('china', 40)
-
-for word, cosine_distance in synonyms:
-    print("{}: {}".format(word, cosine_distance))
-{% endhighlight %}
+{% include_example python/mllib/word2vec_example.py %}
 </div>
 </div>
 
@@ -277,55 +169,13 @@ so that the new features have unit standard deviation and/or zero mean.
 <div data-lang="scala" markdown="1">
 Refer to the [`StandardScaler` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.StandardScaler
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.util.MLUtils
-
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-
-val scaler1 = new StandardScaler().fit(data.map(x => x.features))
-val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
-// scaler3 is an identical model to scaler2, and will produce identical transformations
-val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)
-
-// data1 will be unit variance.
-val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
-
-// Without converting the features into dense vectors, transformation with zero mean will raise
-// exception on sparse vector.
-// data2 will be unit variance and zero mean.
-val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/StandardScalerExample.scala %}
 </div>
 
 <div data-lang="python" markdown="1">
 Refer to the [`StandardScaler` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.StandardScaler) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.util import MLUtils
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.feature import StandardScaler
-
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-label = data.map(lambda x: x.label)
-features = data.map(lambda x: x.features)
-
-scaler1 = StandardScaler().fit(features)
-scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)
-# scaler3 is an identical model to scaler2, and will produce identical transformations
-scaler3 = StandardScalerModel(scaler2.std, scaler2.mean)
-
-
-# data1 will be unit variance.
-data1 = label.zip(scaler1.transform(features))
-
-# Without converting the features into dense vectors, transformation with zero mean will raise
-# exception on sparse vector.
-# data2 will be unit variance and zero mean.
-data2 = label.zip(scaler1.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
-{% endhighlight %}
+{% include_example python/mllib/standard_scaler_example.py %}
 </div>
 </div>
 
@@ -355,46 +205,13 @@ with $L^2$ norm, and $L^\infty$ norm.
 <div data-lang="scala" markdown="1">
 Refer to the [`Normalizer` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.Normalizer) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.Normalizer
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.util.MLUtils
-
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-
-val normalizer1 = new Normalizer()
-val normalizer2 = new Normalizer(p = Double.PositiveInfinity)
-
-// Each sample in data1 will be normalized using $L^2$ norm.
-val data1 = data.map(x => (x.label, normalizer1.transform(x.features)))
-
-// Each sample in data2 will be normalized using $L^\infty$ norm.
-val data2 = data.map(x => (x.label, normalizer2.transform(x.features)))
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/NormalizerExample.scala %}
 </div>
 
 <div data-lang="python" markdown="1">
 Refer to the [`Normalizer` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.Normalizer) for more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.util import MLUtils
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.feature import Normalizer
-
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-labels = data.map(lambda x: x.label)
-features = data.map(lambda x: x.features)
-
-normalizer1 = Normalizer()
-normalizer2 = Normalizer(p=float("inf"))
-
-# Each sample in data1 will be normalized using $L^2$ norm.
-data1 = labels.zip(normalizer1.transform(features))
-
-# Each sample in data2 will be normalized using $L^\infty$ norm.
-data2 = labels.zip(normalizer2.transform(features))
-{% endhighlight %}
+{% include_example python/mllib/normalizer_example.py %}
 </div>
 </div>
 
@@ -435,29 +252,7 @@ The following example shows the basic use of ChiSqSelector. The data set used ha
 Refer to the [`ChiSqSelector` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector)
 for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.mllib.feature.ChiSqSelector
-
-// Load some data in libsvm format
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-// Discretize data in 16 equal bins since ChiSqSelector requires categorical features
-// Even though features are doubles, the ChiSqSelector treats each unique value as a category
-val discretizedData = data.map { lp =>
-  LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor } ) )
-}
-// Create ChiSqSelector that will select top 50 of 692 features
-val selector = new ChiSqSelector(50)
-// Create ChiSqSelector model (selecting features)
-val transformer = selector.fit(discretizedData)
-// Filter the top 50 features from each feature vector
-val filteredData = discretizedData.map { lp => 
-  LabeledPoint(lp.label, transformer.transform(lp.features)) 
-}
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
@@ -465,52 +260,7 @@ val filteredData = discretizedData.map { lp =>
 Refer to the [`ChiSqSelector` Java docs](api/java/org/apache/spark/mllib/feature/ChiSqSelector.html)
 for details on the API.
 
-{% highlight java %}
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.feature.ChiSqSelector;
-import org.apache.spark.mllib.feature.ChiSqSelectorModel;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-
-SparkConf sparkConf = new SparkConf().setAppName("JavaChiSqSelector");
-JavaSparkContext sc = new JavaSparkContext(sparkConf);
-JavaRDD<LabeledPoint> points = MLUtils.loadLibSVMFile(sc.sc(),
-    "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache();
-
-// Discretize data in 16 equal bins since ChiSqSelector requires categorical features
-// Even though features are doubles, the ChiSqSelector treats each unique value as a category
-JavaRDD<LabeledPoint> discretizedData = points.map(
-    new Function<LabeledPoint, LabeledPoint>() {
-      @Override
-      public LabeledPoint call(LabeledPoint lp) {
-        final double[] discretizedFeatures = new double[lp.features().size()];
-        for (int i = 0; i < lp.features().size(); ++i) {
-          discretizedFeatures[i] = Math.floor(lp.features().apply(i) / 16);
-        }
-        return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures));
-      }
-    });
-
-// Create ChiSqSelector that will select top 50 of 692 features
-ChiSqSelector selector = new ChiSqSelector(50);
-// Create ChiSqSelector model (selecting features)
-final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd());
-// Filter the top 50 features from each feature vector
-JavaRDD<LabeledPoint> filteredData = discretizedData.map(
-    new Function<LabeledPoint, LabeledPoint>() {
-      @Override
-      public LabeledPoint call(LabeledPoint lp) {
-        return new LabeledPoint(lp.label(), transformer.transform(lp.features()));
-      }
-    }
-);
-
-sc.stop();
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java %}
 </div>
 </div>
 
@@ -554,78 +304,19 @@ This example below demonstrates how to transform vectors using a transforming ve
 
 Refer to the [`ElementwiseProduct` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ElementwiseProduct) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.ElementwiseProduct
-import org.apache.spark.mllib.linalg.Vectors
-
-// Create some vector data; also works for sparse vectors
-val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)))
-
-val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
-val transformer = new ElementwiseProduct(transformingVector)
-
-// Batch transform and per-row transform give the same results:
-val transformedData = transformer.transform(data)
-val transformedData2 = data.map(x => transformer.transform(x))
-
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 Refer to the [`ElementwiseProduct` Java docs](api/java/org/apache/spark/mllib/feature/ElementwiseProduct.html) for details on the API.
 
-{% highlight java %}
-import java.util.Arrays;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.feature.ElementwiseProduct;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.Vectors;
-
-// Create some vector data; also works for sparse vectors
-JavaRDD<Vector> data = sc.parallelize(Arrays.asList(
-  Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)));
-Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
-ElementwiseProduct transformer = new ElementwiseProduct(transformingVector);
-
-// Batch transform and per-row transform give the same results:
-JavaRDD<Vector> transformedData = transformer.transform(data);
-JavaRDD<Vector> transformedData2 = data.map(
-  new Function<Vector, Vector>() {
-    @Override
-    public Vector call(Vector v) {
-      return transformer.transform(v);
-    }
-  }
-);
-
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
 Refer to the [`ElementwiseProduct` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.ElementwiseProduct) for more details on the API.
 
-{% highlight python %}
-from pyspark import SparkContext
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.feature import ElementwiseProduct
-
-# Load and parse the data
-sc = SparkContext()
-data = sc.textFile("data/mllib/kmeans_data.txt")
-parsedData = data.map(lambda x: [float(t) for t in x.split(" ")])
-
-# Create weight vector.
-transformingVector = Vectors.dense([0.0, 1.0, 2.0])
-transformer = ElementwiseProduct(transformingVector)
-
-# Batch transform
-transformedData = transformer.transform(parsedData)
-# Single-row transform
-transformedData2 = transformer.transform(parsedData.first())
-
-{% endhighlight %}
+{% include_example python/mllib/elementwise_product_example.py %}
 </div>
 </div>
 
@@ -645,44 +336,6 @@ for calculation a [Linear Regression]((mllib-linear-methods.html))
 <div data-lang="scala" markdown="1">
 Refer to the [`PCA` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.PCA) for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.regression.LinearRegressionWithSGD
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.feature.PCA
-
-val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
-  val parts = line.split(',')
-  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
-}.cache()
-
-val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
-val training = splits(0).cache()
-val test = splits(1)
-
-val pca = new PCA(training.first().features.size/2).fit(data.map(_.features))
-val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
-val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))
-
-val numIterations = 100
-val model = LinearRegressionWithSGD.train(training, numIterations)
-val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)
-
-val valuesAndPreds = test.map { point =>
-  val score = model.predict(point.features)
-  (score, point.label)
-}
-
-val valuesAndPreds_pca = test_pca.map { point =>
-  val score = model_pca.predict(point.features)
-  (score, point.label)
-}
-
-val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean()
-val MSE_pca = valuesAndPreds_pca.map{case(v, p) => math.pow((v - p), 2)}.mean()
-
-println("Mean Squared Error = " + MSE)
-println("PCA Mean Squared Error = " + MSE_pca)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/mllib/PCAExample.scala %}
 </div>
 </div>
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java
new file mode 100644
index 0000000000..ad44acb4cd
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.VoidFunction;
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.feature.ChiSqSelector;
+import org.apache.spark.mllib.feature.ChiSqSelectorModel;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+// $example off$
+
+public class JavaChiSqSelectorExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaChiSqSelectorExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // $example on$
+    JavaRDD<LabeledPoint> points = MLUtils.loadLibSVMFile(jsc.sc(),
+      "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache();
+
+    // Discretize data in 16 equal bins since ChiSqSelector requires categorical features
+    // Although features are doubles, the ChiSqSelector treats each unique value as a category
+    JavaRDD<LabeledPoint> discretizedData = points.map(
+      new Function<LabeledPoint, LabeledPoint>() {
+        @Override
+        public LabeledPoint call(LabeledPoint lp) {
+          final double[] discretizedFeatures = new double[lp.features().size()];
+          for (int i = 0; i < lp.features().size(); ++i) {
+            discretizedFeatures[i] = Math.floor(lp.features().apply(i) / 16);
+          }
+          return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures));
+        }
+      }
+    );
+
+    // Create ChiSqSelector that will select top 50 of 692 features
+    ChiSqSelector selector = new ChiSqSelector(50);
+    // Create ChiSqSelector model (selecting features)
+    final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd());
+    // Filter the top 50 features from each feature vector
+    JavaRDD<LabeledPoint> filteredData = discretizedData.map(
+      new Function<LabeledPoint, LabeledPoint>() {
+        @Override
+        public LabeledPoint call(LabeledPoint lp) {
+          return new LabeledPoint(lp.label(), transformer.transform(lp.features()));
+        }
+      }
+    );
+    // $example off$
+
+    System.out.println("filtered data: ");
+    filteredData.foreach(new VoidFunction<LabeledPoint>() {
+      @Override
+      public void call(LabeledPoint labeledPoint) throws Exception {
+        System.out.println(labeledPoint.toString());
+      }
+    });
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java
new file mode 100644
index 0000000000..c8ce6ab284
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaElementwiseProductExample.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+// $example on$
+import java.util.Arrays;
+// $example off$
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.mllib.feature.ElementwiseProduct;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+// $example off$
+import org.apache.spark.api.java.function.VoidFunction;
+
+public class JavaElementwiseProductExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaElementwiseProductExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+
+    // $example on$
+    // Create some vector data; also works for sparse vectors
+    JavaRDD<Vector> data = jsc.parallelize(Arrays.asList(
+      Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)));
+    Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
+    final ElementwiseProduct transformer = new ElementwiseProduct(transformingVector);
+
+    // Batch transform and per-row transform give the same results:
+    JavaRDD<Vector> transformedData = transformer.transform(data);
+    JavaRDD<Vector> transformedData2 = data.map(
+      new Function<Vector, Vector>() {
+        @Override
+        public Vector call(Vector v) {
+          return transformer.transform(v);
+        }
+      }
+    );
+    // $example off$
+
+    System.out.println("transformedData: ");
+    transformedData.foreach(new VoidFunction<Vector>() {
+      @Override
+      public void call(Vector vector) throws Exception {
+        System.out.println(vector.toString());
+      }
+    });
+
+    System.out.println("transformedData2: ");
+    transformedData2.foreach(new VoidFunction<Vector>() {
+      @Override
+      public void call(Vector vector) throws Exception {
+        System.out.println(vector.toString());
+      }
+    });
+
+    jsc.stop();
+  }
+}
diff --git a/examples/src/main/python/mllib/elementwise_product_example.py b/examples/src/main/python/mllib/elementwise_product_example.py
new file mode 100644
index 0000000000..6d8bf6d42e
--- /dev/null
+++ b/examples/src/main/python/mllib/elementwise_product_example.py
@@ -0,0 +1,51 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.feature import ElementwiseProduct
+from pyspark.mllib.linalg import Vectors
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="ElementwiseProductExample")  # SparkContext
+
+    # $example on$
+    data = sc.textFile("data/mllib/kmeans_data.txt")
+    parsedData = data.map(lambda x: [float(t) for t in x.split(" ")])
+
+    # Create weight vector.
+    transformingVector = Vectors.dense([0.0, 1.0, 2.0])
+    transformer = ElementwiseProduct(transformingVector)
+
+    # Batch transform
+    transformedData = transformer.transform(parsedData)
+    # Single-row transform
+    transformedData2 = transformer.transform(parsedData.first())
+    # $example off$
+
+    print("transformedData:")
+    for each in transformedData.collect():
+        print(each)
+
+    print("transformedData2:")
+    for each in transformedData2.collect():
+        print(each)
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/normalizer_example.py b/examples/src/main/python/mllib/normalizer_example.py
new file mode 100644
index 0000000000..a4e028ca9a
--- /dev/null
+++ b/examples/src/main/python/mllib/normalizer_example.py
@@ -0,0 +1,52 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.feature import Normalizer
+from pyspark.mllib.util import MLUtils
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="NormalizerExample")  # SparkContext
+
+    # $example on$
+    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+    labels = data.map(lambda x: x.label)
+    features = data.map(lambda x: x.features)
+
+    normalizer1 = Normalizer()
+    normalizer2 = Normalizer(p=float("inf"))
+
+    # Each sample in data1 will be normalized using $L^2$ norm.
+    data1 = labels.zip(normalizer1.transform(features))
+
+    # Each sample in data2 will be normalized using $L^\infty$ norm.
+    data2 = labels.zip(normalizer2.transform(features))
+    # $example off$
+
+    print("data1:")
+    for each in data1.collect():
+        print(each)
+
+    print("data2:")
+    for each in data2.collect():
+        print(each)
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/standard_scaler_example.py b/examples/src/main/python/mllib/standard_scaler_example.py
new file mode 100644
index 0000000000..20a77a4708
--- /dev/null
+++ b/examples/src/main/python/mllib/standard_scaler_example.py
@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.feature import StandardScaler, StandardScalerModel
+from pyspark.mllib.linalg import Vectors
+from pyspark.mllib.util import MLUtils
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="StandardScalerExample")  # SparkContext
+
+    # $example on$
+    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+    label = data.map(lambda x: x.label)
+    features = data.map(lambda x: x.features)
+
+    scaler1 = StandardScaler().fit(features)
+    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)
+
+    # data1 will be unit variance.
+    data1 = label.zip(scaler1.transform(features))
+
+    # Without converting the features into dense vectors, transformation with zero mean will raise
+    # exception on sparse vector.
+    # data2 will be unit variance and zero mean.
+    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
+    # $example off$
+
+    print("data1:")
+    for each in data1.collect():
+        print(each)
+
+    print("data2:")
+    for each in data2.collect():
+        print(each)
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/tf_idf_example.py b/examples/src/main/python/mllib/tf_idf_example.py
new file mode 100644
index 0000000000..c4d53333a9
--- /dev/null
+++ b/examples/src/main/python/mllib/tf_idf_example.py
@@ -0,0 +1,57 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.feature import HashingTF, IDF
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="TFIDFExample")  # SparkContext
+
+    # $example on$
+    # Load documents (one per line).
+    documents = sc.textFile("data/mllib/kmeans_data.txt").map(lambda line: line.split(" "))
+
+    hashingTF = HashingTF()
+    tf = hashingTF.transform(documents)
+
+    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
+    # First to compute the IDF vector and second to scale the term frequencies by IDF.
+    tf.cache()
+    idf = IDF().fit(tf)
+    tfidf = idf.transform(tf)
+
+    # spark.mllib's IDF implementation provides an option for ignoring terms
+    # which occur in less than a minimum number of documents.
+    # In such cases, the IDF for these terms is set to 0.
+    # This feature can be used by passing the minDocFreq value to the IDF constructor.
+    idfIgnore = IDF(minDocFreq=2).fit(tf)
+    tfidfIgnore = idf.transform(tf)
+    # $example off$
+
+    print("tfidf:")
+    for each in tfidf.collect():
+        print(each)
+
+    print("tfidfIgnore:")
+    for each in tfidfIgnore.collect():
+        print(each)
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/word2vec_example.py b/examples/src/main/python/mllib/word2vec_example.py
new file mode 100644
index 0000000000..ad1090c77e
--- /dev/null
+++ b/examples/src/main/python/mllib/word2vec_example.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.feature import Word2Vec
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="Word2VecExample")  # SparkContext
+
+    # $example on$
+    inp = sc.textFile("data/mllib/sample_lda_data.txt").map(lambda row: row.split(" "))
+
+    word2vec = Word2Vec()
+    model = word2vec.fit(inp)
+
+    synonyms = model.findSynonyms('1', 5)
+
+    for word, cosine_distance in synonyms:
+        print("{}: {}".format(word, cosine_distance))
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala
new file mode 100644
index 0000000000..5e400b7d71
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.feature.ChiSqSelector
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.util.MLUtils
+// $example off$
+
+object ChiSqSelectorExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("ChiSqSelectorExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // Load some data in libsvm format
+    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+    // Discretize data in 16 equal bins since ChiSqSelector requires categorical features
+    // Even though features are doubles, the ChiSqSelector treats each unique value as a category
+    val discretizedData = data.map { lp =>
+      LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor }))
+    }
+    // Create ChiSqSelector that will select top 50 of 692 features
+    val selector = new ChiSqSelector(50)
+    // Create ChiSqSelector model (selecting features)
+    val transformer = selector.fit(discretizedData)
+    // Filter the top 50 features from each feature vector
+    val filteredData = discretizedData.map { lp =>
+      LabeledPoint(lp.label, transformer.transform(lp.features))
+    }
+    // $example off$
+
+    println("filtered data: ")
+    filteredData.foreach(x => println(x))
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala
new file mode 100644
index 0000000000..1e4e354319
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ElementwiseProductExample.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+// $example off$
+
+object ElementwiseProductExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("ElementwiseProductExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // Create some vector data; also works for sparse vectors
+    val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)))
+
+    val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
+    val transformer = new ElementwiseProduct(transformingVector)
+
+    // Batch transform and per-row transform give the same results:
+    val transformedData = transformer.transform(data)
+    val transformedData2 = data.map(x => transformer.transform(x))
+    // $example off$
+
+    println("transformedData: ")
+    transformedData.foreach(x => println(x))
+
+    println("transformedData2: ")
+    transformedData2.foreach(x => println(x))
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala
new file mode 100644
index 0000000000..b3a9604c2b
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/NormalizerExample.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.feature.Normalizer
+import org.apache.spark.mllib.util.MLUtils
+// $example off$
+
+object NormalizerExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("NormalizerExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+
+    val normalizer1 = new Normalizer()
+    val normalizer2 = new Normalizer(p = Double.PositiveInfinity)
+
+    // Each sample in data1 will be normalized using $L^2$ norm.
+    val data1 = data.map(x => (x.label, normalizer1.transform(x.features)))
+
+    // Each sample in data2 will be normalized using $L^\infty$ norm.
+    val data2 = data.map(x => (x.label, normalizer2.transform(x.features)))
+    // $example off$
+
+    println("data1: ")
+    data1.foreach(x => println(x))
+
+    println("data2: ")
+    data2.foreach(x => println(x))
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala
new file mode 100644
index 0000000000..f7a8136953
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
+// $example off$
+
+object PCAExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("PCAExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
+      val parts = line.split(',')
+      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
+    }.cache()
+
+    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
+    val training = splits(0).cache()
+    val test = splits(1)
+
+    val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))
+    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
+    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))
+
+    val numIterations = 100
+    val model = LinearRegressionWithSGD.train(training, numIterations)
+    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)
+
+    val valuesAndPreds = test.map { point =>
+      val score = model.predict(point.features)
+      (score, point.label)
+    }
+
+    val valuesAndPreds_pca = test_pca.map { point =>
+      val score = model_pca.predict(point.features)
+      (score, point.label)
+    }
+
+    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
+    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()
+
+    println("Mean Squared Error = " + MSE)
+    println("PCA Mean Squared Error = " + MSE_pca)
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala
new file mode 100644
index 0000000000..fc0aa1b7f0
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StandardScalerExample.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLUtils
+// $example off$
+
+object StandardScalerExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("StandardScalerExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+
+    val scaler1 = new StandardScaler().fit(data.map(x => x.features))
+    val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
+    // scaler3 is an identical model to scaler2, and will produce identical transformations
+    val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)
+
+    // data1 will be unit variance.
+    val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
+
+    // Without converting the features into dense vectors, transformation with zero mean will raise
+    // exception on sparse vector.
+    // data2 will be unit variance and zero mean.
+    val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray))))
+    // $example off$
+
+    println("data1: ")
+    data1.foreach(x => println(x))
+
+    println("data2: ")
+    data2.foreach(x => println(x))
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala
new file mode 100644
index 0000000000..a5bdcd8f2e
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TFIDFExample.scala
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.feature.{HashingTF, IDF}
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.rdd.RDD
+// $example off$
+
+object TFIDFExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("TFIDFExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // Load documents (one per line).
+    val documents: RDD[Seq[String]] = sc.textFile("data/mllib/kmeans_data.txt")
+      .map(_.split(" ").toSeq)
+
+    val hashingTF = new HashingTF()
+    val tf: RDD[Vector] = hashingTF.transform(documents)
+
+    // While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
+    // First to compute the IDF vector and second to scale the term frequencies by IDF.
+    tf.cache()
+    val idf = new IDF().fit(tf)
+    val tfidf: RDD[Vector] = idf.transform(tf)
+
+    // spark.mllib IDF implementation provides an option for ignoring terms which occur in less than
+    // a minimum number of documents. In such cases, the IDF for these terms is set to 0.
+    // This feature can be used by passing the minDocFreq value to the IDF constructor.
+    val idfIgnore = new IDF(minDocFreq = 2).fit(tf)
+    val tfidfIgnore: RDD[Vector] = idfIgnore.transform(tf)
+    // $example off$
+
+    println("tfidf: ")
+    tfidf.foreach(x => println(x))
+
+    println("tfidfIgnore: ")
+    tfidfIgnore.foreach(x => println(x))
+
+    sc.stop()
+  }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala
new file mode 100644
index 0000000000..ea794c700a
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/Word2VecExample.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
+// $example on$
+import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
+// $example off$
+
+object Word2VecExample {
+
+  def main(args: Array[String]): Unit = {
+
+    val conf = new SparkConf().setAppName("Word2VecExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    val input = sc.textFile("data/mllib/sample_lda_data.txt").map(line => line.split(" ").toSeq)
+
+    val word2vec = new Word2Vec()
+
+    val model = word2vec.fit(input)
+
+    val synonyms = model.findSynonyms("1", 5)
+
+    for((synonym, cosineSimilarity) <- synonyms) {
+      println(s"$synonym $cosineSimilarity")
+    }
+
+    // Save and load model
+    model.save(sc, "myModelPath")
+    val sameModel = Word2VecModel.load(sc, "myModelPath")
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println