Marie Mørch
The Gaussian mixture model is given by the density \[ p \phi(x; \mu_1, \sigma_1)+ (1 − p)\phi (x; \mu_2, \sigma_2) \] for parameters \(p \in (0, 1)\), \(\mu_1\), \(\mu_2 \in \mathbb{R}\) and \(\sigma_1, \sigma_2 > 0\).
Here \(\phi(·; \mu, \sigma)\) denotes the density for the Gaussian distribution with mean μ and standard deviation σ.
This is the distribution of \[ X = ZY_1 + (1 − Z)Y_2 \] where \(Y_1,Y_2,Z\) are independent,\(P(Z=1)=1−P(Z=0)=p\), and \(Y_i\) is \(N(μ_i,σ_i)\) distributed.
Log-likelihood: \(l(\theta \vert x) = \sum_{i=1}^N \log( p \phi(x_i; \mu_1, \sigma_1))+(1 − p)\phi (x_i; \mu_2, \sigma_2))\).
In stead of working directly with the likelihood it can be easier to work with the densities \(Y\vert \theta\) and \(Z \vert (x, \theta)\) where \(Y=(X,Z)\) is the complete data.
Aim: Implement the EM-algorithm for the Gaussian mixture model
Iteratively seeks to maximize \(L(\theta \vert x)\) w.r.t. \(\theta\).
\(Q(\theta \vert \theta^{(t)} )\) is the expectation of the joint log-likelihood for the complete data given the observed data.
Each maximization step will increase the observed likelihood.
E-step: Compute \(\gamma_i = P(Z_i=1 \vert x, \theta^{(t)})= \frac{ p \phi(x_i; \mu_1, \sigma_1) }{ p \phi(x_i; \mu_1, \sigma_1) +(1 − p)\phi (x_i; \mu_2, \sigma_2)}\)
M-step: \[ \begin{aligned} \hat{\mu}_1 & = \frac{ \sum_{i=1}^N(\gamma_i\cdot x_i)}{\sum_{i=1}^N(\gamma_i)} \\ \hat{\mu}_2 & = \frac{ \sum_{i=1}^N((1-\gamma_i) \cdot x_i)}{\sum_{i=1}^N(1-\gamma_i)}\\ \hat{\sigma}_1^2 & = \frac{ \sum_{i=1}^N(\gamma_i \cdot (x_i-\hat{\mu}_1)}{\sum_{i=1}^N(\gamma_i)}\\ \hat{\sigma}_2^2 & = \frac{ \sum_{i=1}^N((1-\gamma_i) \cdot (x_i-\hat{\mu}_2)}{\sum_{i=1}^N(1-\gamma_i)}\\ \hat{p} & = \sum_{i=1}^N \frac{\gamma_i}{N} \end{aligned} \]
# par = (mean1, sd1 , mean2, sd2, prob)
Estep <- function(par,x){
T1 <- dnorm(x, par[1], par[2])
T2 <- dnorm(x, par[3], par[4])
(par[5]*T1/(par[5]*T1+(1-par[5])*T2))
}
Mstep<- function(par, gamma,x){
par[1] <- sum(gamma*x)/sum(gamma)
par[2] <- sqrt(sum((gamma)*(x-par[1])^2)/sum(gamma))
par[3] <- sum((1-gamma)*x)/sum(1-gamma)
par[4] <- sqrt(sum((1-gamma)*(x-par[3])^2)/sum(1-gamma))
par[5] <- mean(gamma)
par
}
EM <- function(par, x, tmax = 200) {
for(i in 1:tmax)
par <- Mstep(par,Estep(par, x),x)
par
}We would like to model the density of the data points, and due to the apparent bi-modality a Gaussian mixture model seems reasonably.
myPar <- c(-0.5,1, 2.5,1, 0.5)
phat<-EM(myPar, data)
phat## [1] -0.5626630 0.7139917 2.4063524 0.3637633 0.5737205
logLik <- function(par, x){
if (par[2] <= 0 || par[4] <= 0 || par[5] > 1 || par[5] < 0){
return(Inf)
}
T1 <- dnorm(x, par[1], par[2])
T2 <- dnorm(x, par[3], par[4])
-sum(log(par[5]*T1+(1-par[5])*T2))
}
optim(c(-0.5,0.7,3,0.5,0.5),logLik, x=data)$par## [1] -0.5627575 0.7140097 2.4063581 0.3638136 0.5737727
set.seed(3005)
simdata <- c(rnorm(600, -2.5,1), rnorm(400, 3.5, 0.6))
par <- c(-3,1, 3,1, 0.5)
p<-EM(par, simdata)
p## [1] -2.5067016 1.0050496 3.5067559 0.6023329 0.6000529
#Convergence criteria
relCon <- function(par, parprev){
sqrt(sum((par-parprev)^2))/sqrt(sum(parprev^2))
}
#Constructor
mix <- function(mu1, sigma1, mu2, sigma2, p, x, it, conHis, loglik){
structure(list(par=c(mu1, sigma1,mu2,sigma2, p), it=it, conHis=conHis, x=x, loglik=loglik),
class = "mixture")
}
EM <- function(par, x, maxit = 200, tol=10e-5) {
con <- matrix(0,nrow=maxit,ncol= 5 )
loglik <- c()
parprev <- c(0,0,0,0,0)
it<- 0
while(relCon(par, parprev)>=tol && it<maxit) {
parprev <- par
par <- Mstep(par,Estep(par, x),x)
it <- it+1
con[it,] <- par
loglik[it] <- logLik(par, x)
}
mix(par[1], par[2], par[3], par[4], par[5], x, it, conHis=con[1:it,], loglik=loglik)
}gm <- EM(myPar,x= data)
gm$par## [1] -0.5627194 0.7139202 2.4063176 0.3638038 0.5737046
phat## [1] -0.5626630 0.7139917 2.4063524 0.3637633 0.5737205
\(\Psi\) denotes the EM mapping: \(\Psi(\theta')= arg \max Q(\theta \vert \theta')\)
Observed fisher information: \[ \hat{i}_X := -D_{\theta}^2 l(\hat{\theta}) = [I-\hat{i}_{Z\vert X}(\hat{\theta}) \hat{i}_Y(\hat{\theta})^{-1}] ]\hat{i}_Y(\hat{\theta}) = [I-\Psi'(\hat{\theta})^T ]\hat{i}_Y(\hat{\theta}) \]
library(numDeriv)
Qfun <- function(par, x){
gamma <- Estep(par, x)
T1 <- dnorm(x, par[1], par[2])
T2 <- dnorm(x, par[3], par[4])
sum(gamma*log(T1)+(1-gamma)*log(T2)+gamma*log(par[5])+(1-gamma)*log(1-par[5]))
}
se <- function(x) UseMethod("se")
se.mixture <- function(gm){
Psi <- function(pp) Mstep(pp,Estep(pp, x=gm$x),x=gm$x)
Q <- function(par) Qfun(par, x=gm$x)
iY <- -hessian(Q, gm$par)
DPsi <- jacobian(Psi, gm$par)
iX <- (diag(1, 5) - t(DPsi)) %*% iY
OI<-solve(iX)
sqrt(diag(OI))
}We can compute the observed Fisher information using optim:
iHat <- optim(c(-0.5,0.7,3,0.5,0.5),logLik, x=data, hessian=TRUE)$hessian
#Standard errors:
sqrt(diag(solve(iHat)))## [1] 0.04752175 0.03596568 0.02816109 0.02113701 0.02437942
Comparison with the SEM-algorithm:
gm <- EM(myPar,x= data)
se(gm)## [1] 0.04822583 0.03749820 0.02884105 0.02239508 0.02447708
plot.mixture <- function(gm,...){
mi<- min(gm$x)-1
ma <- max(gm$x)+1
hist(gm$x,freq=F, ...)
curve(gm$par[5]*dnorm(x, gm$par[1],gm$par[2])+(1-gm$par[5])*dnorm(x, gm$par[3],gm$par[4]), mi, ma, add=TRUE)
}
print.mixture<- function(gm){
seval <- se(gm)
cat("Par \t Est \t SE \n")
cat(paste("mu1 \t", round(gm$par[1],5), round(seval[1],5), "\n"))
cat(paste("sigma1 \t", round(gm$par[2],5), round(seval[2],5), "\n"))
cat(paste("mu2 \t", round(gm$par[3],5), round(seval[3],5), "\n"))
cat(paste("sigma2 \t", round(gm$par[4],5), round(seval[4],5), "\n"))
cat(paste("p \t ", round(gm$par[5],5), round(seval[5],5), "\n"))
}
convergenceplot <- function(x) UseMethod("convergenceplot")
convergenceplot.mixture <- function(gm){
par(mfrow=c(2,3))
plot(1:gm$it, gm$conHis[,1], type="l", main = expression(mu[1]), xlab="Iterations", ylab="Parameter")
plot(1:gm$it, gm$conHis[,2], type="l", main = expression(sigma[1]), xlab="Iterations", ylab="Parameter")
plot(1:gm$it, gm$conHis[,3], type="l", main = expression(mu[2]), xlab="Iterations", ylab="Parameter")
plot(1:gm$it, gm$conHis[,4], type="l", main = expression(sigma[2]), xlab="Iterations", ylab="Parameter")
plot(1:gm$it, gm$conHis[,5], type="l", main = "p", xlab="Iterations", ylab="Parameter")
plot(1:gm$it, gm$loglik, type="l", main = "Log-likelihood", xlab="Iterations", ylab="Parameter")
}## Par Est SE
## mu1 -0.56272 0.04752
## sigma1 0.71392 0.03597
## mu2 2.40632 0.02816
## sigma2 0.3638 0.02114
## p 0.5737 0.02438
set.seed(1234)
simdata <- c(rnorm(600, -2.5,1), rnorm(400, 3.5, 0.6))
p <- c(-2.5, 1, 3.5,0.6, 0.6)
#Close to optimum
par1 <- c(-3,1, 3,1, 0.5)
gm1 <- EM(par1,x=simdata)
gm1$par## [1] -2.5210611 1.0181101 3.4793744 0.5786907 0.6000298
gm1$it## [1] 4
sum((gm1$par-p)^2) ## [1] 0.001651049
#Far away from uptimum
par2 <- c(-20,10, -21, 12, 0.9)
gm2 <- EM(par2,x=simdata)
gm2$par## [1] -2.5210609 1.0181105 3.4793745 0.5786905 0.6000298
gm2$it## [1] 82
sum((gm2$par-p)^2) ## [1] 0.001651057
#Same value of mean parameter
par3 <- c(2,1, 2,1, 0.5)
gm3 <- EM(par3,x=simdata)
gm3$par## [1] -0.1210656 3.0654433 -0.1210656 3.0654433 0.5000000
gm3$it## [1] 2
sum((gm3$par-p)^2) ## [1] 29.12591
intPar <- function(x){
m1 <- sample(x, 1)
m2 <- sample(x, 1)
s1 <- s2 <- sd(x)
c(m1, s1, m2,s2,0.5)
}
autopar <- intPar(simdata)
gm <- EM(autopar,x=simdata)
gm$par## [1] 3.4793745 0.5786906 -2.5210610 1.0181103 0.3999702
gm$it## [1] 26
set.seed(1234)
simdata <- c(rnorm(600, -2.5,1), rnorm(400, 3.5, 0.6))
p <- c(-2.5, 1, 3.5,0.6, 0.6)
par <-c(-2, 1, 3,1, 0.5)set.seed(12345)
simdata1 <- c(rnorm(500, -2.5,1), rnorm(500, 4.5, 0.6))
p <- c(-2.5, 1, 4.5,0.6, 0.5)
hist(simdata1)par <- c(3, 1, 4,1, 0.5)
gm1 <- EM(par,x=simdata1)
gm1$par## [1] -2.4175387 0.9891595 4.5059615 0.6035696 0.5000000
gm1$it## [1] 6
sum((gm1$par-p)^2) ## [1] 0.006965658
set.seed(12345)
simdata2 <- c(rnorm(400, 3, 0.5), rnorm(600, 2,1))
hist(simdata2)p <- c(3, 0.5, 2,1, 0.4)
par <- c(3.5, 0.5, 2.5,1.5, 0.5)
gm2 <- EM(par,x=simdata2)
gm2$par## [1] 3.0404325 0.5062688 2.0318703 1.0012274 0.3880307
gm2$it## [1] 62
sum((gm2$par-p)^2) ## [1] 0.00283457
set.seed(12345)
simdata3 <- c(rnorm(50, -2, 0.5), rnorm(950, 2,1))
p <- c(2, 1, -2,0.5, 0.95)
hist(simdata3)par <- c(2.5, 1, -2.5,1, 0.5)
gm3 <- EM(par,x=simdata3)
gm3$par## [1] 2.0361620 0.9983877 -1.9069745 0.5642055 0.9506857
gm3$it## [1] 22
sum((gm3$par-p)^2)## [1] 0.01408684