aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorTravis Galoppo <tjg2107@columbia.edu>2015-01-11 21:31:16 -0800
committerXiangrui Meng <meng@databricks.com>2015-01-11 21:31:16 -0800
commit2130de9d8f50f52b9b2d296b377df81d840546b3 (patch)
tree07b9c8fcce82d15690ad7cba68ea024fa409980c /mllib
parentf38ef6586c2980183c983b2aa14a5ddc1856b7b7 (diff)
downloadspark-2130de9d8f50f52b9b2d296b377df81d840546b3.tar.gz
spark-2130de9d8f50f52b9b2d296b377df81d840546b3.tar.bz2
spark-2130de9d8f50f52b9b2d296b377df81d840546b3.zip
SPARK-5018 [MLlib] [WIP] Make MultivariateGaussian public
Moving MutlivariateGaussian from private[mllib] to public. The class uses Breeze vectors internally, so this involves creating a public interface using MLlib vectors and matrices. This initial commit provides public construction, accessors for mean/covariance, density and log-density. Other potential methods include entropy and sample generation. Author: Travis Galoppo <tjg2107@columbia.edu> Closes #3923 from tgaloppo/spark-5018 and squashes the following commits: 2b15587 [Travis Galoppo] Style correction b4121b4 [Travis Galoppo] Merge remote-tracking branch 'upstream/master' into spark-5018 e30a100 [Travis Galoppo] Made mu, sigma private[mllib] members of MultivariateGaussian Moved MultivariateGaussian (and test suite) from stat.impl to stat.distribution (required updates in GaussianMixture{EM,Model}.scala) Marked MultivariateGaussian as @DeveloperApi Fixed style error 9fa3bb7 [Travis Galoppo] Style improvements 91a5fae [Travis Galoppo] Rearranged equation for part of density function 8c35381 [Travis Galoppo] Fixed accessor methods to match member variable names. Modified calculations to avoid log(pow(x,y)) calculations 0943dc4 [Travis Galoppo] SPARK-5018 4dee9e1 [Travis Galoppo] SPARK-5018
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala11
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala2
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala (renamed from mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala)70
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala (renamed from mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala)33
4 files changed, 75 insertions, 41 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
index b3c5631cc4..d8e1346194 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
@@ -20,10 +20,11 @@ package org.apache.spark.mllib.clustering
import scala.collection.mutable.IndexedSeq
import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix, diag, Transpose}
-import org.apache.spark.rdd.RDD
+
import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors, DenseVector, DenseMatrix, BLAS}
-import org.apache.spark.mllib.stat.impl.MultivariateGaussian
+import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
/**
@@ -134,7 +135,7 @@ class GaussianMixtureEM private (
// derived from the samples
val (weights, gaussians) = initialModel match {
case Some(gmm) => (gmm.weight, gmm.mu.zip(gmm.sigma).map { case(mu, sigma) =>
- new MultivariateGaussian(mu.toBreeze.toDenseVector, sigma.toBreeze.toDenseMatrix)
+ new MultivariateGaussian(mu, sigma)
})
case None => {
@@ -176,8 +177,8 @@ class GaussianMixtureEM private (
}
// Need to convert the breeze matrices to MLlib matrices
- val means = Array.tabulate(k) { i => Vectors.fromBreeze(gaussians(i).mu) }
- val sigmas = Array.tabulate(k) { i => Matrices.fromBreeze(gaussians(i).sigma) }
+ val means = Array.tabulate(k) { i => gaussians(i).mu }
+ val sigmas = Array.tabulate(k) { i => gaussians(i).sigma }
new GaussianMixtureModel(weights, means, sigmas)
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index b461ea4f0f..416cad080c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -21,7 +21,7 @@ import breeze.linalg.{DenseVector => BreezeVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Matrix, Vector}
-import org.apache.spark.mllib.stat.impl.MultivariateGaussian
+import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
import org.apache.spark.mllib.util.MLUtils
/**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
index bc7f6c5197..fd186b5ee6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussian.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussian.scala
@@ -15,13 +15,16 @@
* limitations under the License.
*/
-package org.apache.spark.mllib.stat.impl
+package org.apache.spark.mllib.stat.distribution
import breeze.linalg.{DenseVector => DBV, DenseMatrix => DBM, diag, max, eigSym}
+import org.apache.spark.annotation.DeveloperApi;
+import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix}
import org.apache.spark.mllib.util.MLUtils
/**
+ * :: DeveloperApi ::
* This class provides basic functionality for a Multivariate Gaussian (Normal) Distribution. In
* the event that the covariance matrix is singular, the density will be computed in a
* reduced dimensional subspace under which the distribution is supported.
@@ -30,33 +33,64 @@ import org.apache.spark.mllib.util.MLUtils
* @param mu The mean vector of the distribution
* @param sigma The covariance matrix of the distribution
*/
-private[mllib] class MultivariateGaussian(
- val mu: DBV[Double],
- val sigma: DBM[Double]) extends Serializable {
+@DeveloperApi
+class MultivariateGaussian (
+ val mu: Vector,
+ val sigma: Matrix) extends Serializable {
+ require(sigma.numCols == sigma.numRows, "Covariance matrix must be square")
+ require(mu.size == sigma.numCols, "Mean vector length must match covariance matrix size")
+
+ private val breezeMu = mu.toBreeze.toDenseVector
+
+ /**
+ * private[mllib] constructor
+ *
+ * @param mu The mean vector of the distribution
+ * @param sigma The covariance matrix of the distribution
+ */
+ private[mllib] def this(mu: DBV[Double], sigma: DBM[Double]) = {
+ this(Vectors.fromBreeze(mu), Matrices.fromBreeze(sigma))
+ }
+
/**
* Compute distribution dependent constants:
- * rootSigmaInv = D^(-1/2) * U, where sigma = U * D * U.t
- * u = (2*pi)^(-k/2) * det(sigma)^(-1/2)
+ * rootSigmaInv = D^(-1/2)^ * U, where sigma = U * D * U.t
+ * u = log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
*/
private val (rootSigmaInv: DBM[Double], u: Double) = calculateCovarianceConstants
/** Returns density of this multivariate Gaussian at given point, x */
- def pdf(x: DBV[Double]): Double = {
- val delta = x - mu
+ def pdf(x: Vector): Double = {
+ pdf(x.toBreeze.toDenseVector)
+ }
+
+ /** Returns the log-density of this multivariate Gaussian at given point, x */
+ def logpdf(x: Vector): Double = {
+ logpdf(x.toBreeze.toDenseVector)
+ }
+
+ /** Returns density of this multivariate Gaussian at given point, x */
+ private[mllib] def pdf(x: DBV[Double]): Double = {
+ math.exp(logpdf(x))
+ }
+
+ /** Returns the log-density of this multivariate Gaussian at given point, x */
+ private[mllib] def logpdf(x: DBV[Double]): Double = {
+ val delta = x - breezeMu
val v = rootSigmaInv * delta
- u * math.exp(v.t * v * -0.5)
+ u + v.t * v * -0.5
}
/**
* Calculate distribution dependent components used for the density function:
- * pdf(x) = (2*pi)^(-k/2) * det(sigma)^(-1/2) * exp( (-1/2) * (x-mu).t * inv(sigma) * (x-mu) )
+ * pdf(x) = (2*pi)^(-k/2)^ * det(sigma)^(-1/2)^ * exp((-1/2) * (x-mu).t * inv(sigma) * (x-mu))
* where k is length of the mean vector.
*
* We here compute distribution-fixed parts
- * (2*pi)^(-k/2) * det(sigma)^(-1/2)
+ * log((2*pi)^(-k/2)^ * det(sigma)^(-1/2)^)
* and
- * D^(-1/2) * U, where sigma = U * D * U.t
+ * D^(-1/2)^ * U, where sigma = U * D * U.t
*
* Both the determinant and the inverse can be computed from the singular value decomposition
* of sigma. Noting that covariance matrices are always symmetric and positive semi-definite,
@@ -65,11 +99,11 @@ private[mllib] class MultivariateGaussian(
*
* sigma = U * D * U.t
* inv(Sigma) = U * inv(D) * U.t
- * = (D^{-1/2} * U).t * (D^{-1/2} * U)
+ * = (D^{-1/2}^ * U).t * (D^{-1/2}^ * U)
*
* and thus
*
- * -0.5 * (x-mu).t * inv(Sigma) * (x-mu) = -0.5 * norm(D^{-1/2} * U * (x-mu))^2
+ * -0.5 * (x-mu).t * inv(Sigma) * (x-mu) = -0.5 * norm(D^{-1/2}^ * U * (x-mu))^2^
*
* To guard against singular covariance matrices, this method computes both the
* pseudo-determinant and the pseudo-inverse (Moore-Penrose). Singular values are considered
@@ -77,21 +111,21 @@ private[mllib] class MultivariateGaussian(
* relation to the maximum singular value (same tolerance used by, e.g., Octave).
*/
private def calculateCovarianceConstants: (DBM[Double], Double) = {
- val eigSym.EigSym(d, u) = eigSym(sigma) // sigma = u * diag(d) * u.t
+ val eigSym.EigSym(d, u) = eigSym(sigma.toBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t
// For numerical stability, values are considered to be non-zero only if they exceed tol.
// This prevents any inverted value from exceeding (eps * n * max(d))^-1
val tol = MLUtils.EPSILON * max(d) * d.length
try {
- // pseudo-determinant is product of all non-zero singular values
- val pdetSigma = d.activeValuesIterator.filter(_ > tol).reduce(_ * _)
+ // log(pseudo-determinant) is sum of the logs of all non-zero singular values
+ val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum
// calculate the root-pseudo-inverse of the diagonal matrix of singular values
// by inverting the square root of all non-zero values
val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))
- (pinvS * u, math.pow(2.0 * math.Pi, -mu.length / 2.0) * math.pow(pdetSigma, -0.5))
+ (pinvS * u, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
} catch {
case uex: UnsupportedOperationException =>
throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
index d58f2587e5..fac2498e4d 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/impl/MultivariateGaussianSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/distribution/MultivariateGaussianSuite.scala
@@ -15,54 +15,53 @@
* limitations under the License.
*/
-package org.apache.spark.mllib.stat.impl
+package org.apache.spark.mllib.stat.distribution
import org.scalatest.FunSuite
-import breeze.linalg.{ DenseVector => BDV, DenseMatrix => BDM }
-
+import org.apache.spark.mllib.linalg.{ Vectors, Matrices }
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
class MultivariateGaussianSuite extends FunSuite with MLlibTestSparkContext {
test("univariate") {
- val x1 = new BDV(Array(0.0))
- val x2 = new BDV(Array(1.5))
+ val x1 = Vectors.dense(0.0)
+ val x2 = Vectors.dense(1.5)
- val mu = new BDV(Array(0.0))
- val sigma1 = new BDM(1, 1, Array(1.0))
+ val mu = Vectors.dense(0.0)
+ val sigma1 = Matrices.dense(1, 1, Array(1.0))
val dist1 = new MultivariateGaussian(mu, sigma1)
assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)
- val sigma2 = new BDM(1, 1, Array(4.0))
+ val sigma2 = Matrices.dense(1, 1, Array(4.0))
val dist2 = new MultivariateGaussian(mu, sigma2)
assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
}
test("multivariate") {
- val x1 = new BDV(Array(0.0, 0.0))
- val x2 = new BDV(Array(1.0, 1.0))
+ val x1 = Vectors.dense(0.0, 0.0)
+ val x2 = Vectors.dense(1.0, 1.0)
- val mu = new BDV(Array(0.0, 0.0))
- val sigma1 = new BDM(2, 2, Array(1.0, 0.0, 0.0, 1.0))
+ val mu = Vectors.dense(0.0, 0.0)
+ val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
val dist1 = new MultivariateGaussian(mu, sigma1)
assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)
- val sigma2 = new BDM(2, 2, Array(4.0, -1.0, -1.0, 2.0))
+ val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
val dist2 = new MultivariateGaussian(mu, sigma2)
assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
}
test("multivariate degenerate") {
- val x1 = new BDV(Array(0.0, 0.0))
- val x2 = new BDV(Array(1.0, 1.0))
+ val x1 = Vectors.dense(0.0, 0.0)
+ val x2 = Vectors.dense(1.0, 1.0)
- val mu = new BDV(Array(0.0, 0.0))
- val sigma = new BDM(2, 2, Array(1.0, 1.0, 1.0, 1.0))
+ val mu = Vectors.dense(0.0, 0.0)
+ val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
val dist = new MultivariateGaussian(mu, sigma)
assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)