aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorwm624@hotmail.com <wm624@hotmail.com>2016-05-17 15:20:47 +0200
committerNick Pentreath <nickp@za.ibm.com>2016-05-17 15:20:47 +0200
commit4134ff0c657efcbf0f61eff0423215afd6132837 (patch)
tree6b26b8824aff57e216974574b30dbfd2d20d3d81
parentc36ca651f9177f8e7a3f6a0098cba5a810ee9deb (diff)
downloadspark-4134ff0c657efcbf0f61eff0423215afd6132837.tar.gz
spark-4134ff0c657efcbf0f61eff0423215afd6132837.tar.bz2
spark-4134ff0c657efcbf0f61eff0423215afd6132837.zip
[SPARK-14434][ML] User guide doc and examples for GaussianMixture in spark.ml
## What changes were proposed in this pull request? (Please fill in changes proposed in this fix) Add guide doc and examples for GaussianMixture in Spark.ml in Java, Scala and Python. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manual compile and test all examples Author: wm624@hotmail.com <wm624@hotmail.com> Closes #12788 from wangmiao1981/example.
-rw-r--r--docs/ml-clustering.md82
-rw-r--r--examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java64
-rw-r--r--examples/src/main/python/ml/gaussian_mixture_example.py48
-rw-r--r--examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala58
4 files changed, 252 insertions, 0 deletions
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
index a0955a3855..33e4b7b0d2 100644
--- a/docs/ml-clustering.md
+++ b/docs/ml-clustering.md
@@ -148,3 +148,85 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.
{% include_example python/ml/bisecting_k_means_example.py %}
</div>
</div>
+
+## Gaussian Mixture Model (GMM)
+
+A [Gaussian Mixture Model](http://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model)
+represents a composite distribution whereby points are drawn from one of *k* Gaussian sub-distributions,
+each with its own probability. The `spark.ml` implementation uses the
+[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm)
+algorithm to induce the maximum-likelihood model given a set of samples.
+
+`GaussianMixture` is implemented as an `Estimator` and generates a `GaussianMixtureModel` as the base
+model.
+
+### Input Columns
+
+<table class="table">
+ <thead>
+ <tr>
+ <th align="left">Param name</th>
+ <th align="left">Type(s)</th>
+ <th align="left">Default</th>
+ <th align="left">Description</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>featuresCol</td>
+ <td>Vector</td>
+ <td>"features"</td>
+ <td>Feature vector</td>
+ </tr>
+ </tbody>
+</table>
+
+### Output Columns
+
+<table class="table">
+ <thead>
+ <tr>
+ <th align="left">Param name</th>
+ <th align="left">Type(s)</th>
+ <th align="left">Default</th>
+ <th align="left">Description</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>predictionCol</td>
+ <td>Int</td>
+ <td>"prediction"</td>
+ <td>Predicted cluster center</td>
+ </tr>
+ <tr>
+ <td>probabilityCol</td>
+ <td>Vector</td>
+ <td>"probability"</td>
+ <td>Probability of each cluster</td>
+ </tr>
+ </tbody>
+</table>
+
+### Example
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.GaussianMixture) for more details.
+
+{% include_example scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/GaussianMixture.html) for more details.
+
+{% include_example java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.GaussianMixture) for more details.
+
+{% include_example python/ml/gaussian_mixture_example.py %}
+</div>
+</div>
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
new file mode 100644
index 0000000000..79b9909581
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaGaussianMixtureExample.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import org.apache.spark.ml.clustering.GaussianMixture;
+import org.apache.spark.ml.clustering.GaussianMixtureModel;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+// $example off$
+import org.apache.spark.sql.SparkSession;
+
+
+/**
+ * An example demonstrating Gaussian Mixture Model.
+ * Run with
+ * <pre>
+ * bin/run-example ml.JavaGaussianMixtureExample
+ * </pre>
+ */
+public class JavaGaussianMixtureExample {
+
+ public static void main(String[] args) {
+
+ // Creates a SparkSession
+ SparkSession spark = SparkSession
+ .builder()
+ .appName("JavaGaussianMixtureExample")
+ .getOrCreate();
+
+ // $example on$
+ // Loads data
+ Dataset<Row> dataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt");
+
+ // Trains a GaussianMixture model
+ GaussianMixture gmm = new GaussianMixture()
+ .setK(2);
+ GaussianMixtureModel model = gmm.fit(dataset);
+
+ // Output the parameters of the mixture model
+ for (int i = 0; i < model.getK(); i++) {
+ System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n",
+ model.weights()[i], model.gaussians()[i].mean(), model.gaussians()[i].cov());
+ }
+ // $example off$
+
+ spark.stop();
+ }
+}
diff --git a/examples/src/main/python/ml/gaussian_mixture_example.py b/examples/src/main/python/ml/gaussian_mixture_example.py
new file mode 100644
index 0000000000..2ca13d68f6
--- /dev/null
+++ b/examples/src/main/python/ml/gaussian_mixture_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.clustering import GaussianMixture
+# $example off$
+from pyspark.sql import SparkSession
+
+"""
+A simple example demonstrating Gaussian Mixture Model (GMM).
+Run with:
+ bin/spark-submit examples/src/main/python/ml/gaussian_mixture_example.py
+"""
+
+if __name__ == "__main__":
+ spark = SparkSession\
+ .builder\
+ .appName("PythonGuassianMixtureExample")\
+ .getOrCreate()
+
+ # $example on$
+ # loads data
+ dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
+
+ gmm = GaussianMixture().setK(2)
+ model = gmm.fit(dataset)
+
+ print("Gaussians: ")
+ model.gaussiansDF.show()
+ # $example off$
+
+ spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
new file mode 100644
index 0000000000..c484ee5556
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+// scalastyle:off println
+
+// $example on$
+import org.apache.spark.ml.clustering.GaussianMixture
+import org.apache.spark.sql.SparkSession
+// $example off$
+
+/**
+ * An example demonstrating Gaussian Mixture Model (GMM).
+ * Run with
+ * {{{
+ * bin/run-example ml.GaussianMixtureExample
+ * }}}
+ */
+object GaussianMixtureExample {
+ def main(args: Array[String]): Unit = {
+ // Creates a SparkSession
+ val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").getOrCreate()
+
+ // $example on$
+ // Loads data
+ val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
+
+ // Trains Gaussian Mixture Model
+ val gmm = new GaussianMixture()
+ .setK(2)
+ val model = gmm.fit(dataset)
+
+ // output parameters of mixture model model
+ for (i <- 0 until model.getK) {
+ println("weight=%f\nmu=%s\nsigma=\n%s\n" format
+ (model.weights(i), model.gaussians(i).mean, model.gaussians(i).cov))
+ }
+ // $example off$
+
+ spark.stop()
+ }
+}
+// scalastyle:on println