0. Preliminary

library(MASS)
library(mixtools)
## mixtools package, version 1.2.0, Released 2020-02-05
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.
library(mvtnorm)
## 
## Attaching package: 'mvtnorm'
## The following objects are masked from 'package:mixtools':
## 
##     dmvnorm, rmvnorm

1. When we have the information of normal behavior

Generate Normal data from Multivariate Normal distribution

\(x.1 \sim \mathcal{N}((0,0),I)\\\)

\(x.2 \sim \mathcal{N}((3,4),I)\)

set.seed(100)
x.1 <- rmvnorm(200, c(0, 0))
x.2 <- rmvnorm(230, c(3, 4))
X.1 <- rbind(x.1, x.2)
plot(X.1)
points(x.1,col=2)
points(x.2,col=3)

Fit Gaussian mixture model with EM-algoritm

fit = mvnormalmixEM(X.1,k=2,maxit = 100)
## number of iterations= 18
lamb_hat = fit$lambda
mu_hat = fit$mu
sigma_hat = fit$sigma

Estimated value of the paramters

summary(fit)
## summary of mvnormalmixEM object:
##           comp 1   comp 2
## lambda  0.476555 0.523445
## mu1    -0.128682 0.169487
## mu2     3.207240 3.982195
## loglik at estimate:  -1508.861
fit$sigma
## [[1]]
##            [,1]       [,2]
## [1,]  0.8148672 -0.0378889
## [2,] -0.0378889  1.1770322
## 
## [[2]]
##            [,1]       [,2]
## [1,]  0.9610726 -0.1058652
## [2,] -0.1058652  1.0595212

Some visualization

plot(fit,whichplots = 1)

plot(fit,whichplots = 2)

Generate unseen (test) data with Outliers

\(ts.1 \sim \mathcal{N}((0,0),I)\)

\(ts.2 \sim \mathcal{N}((3,4),I)\)

\(out.1 \sim \mathcal{N}((-4,7),I)\)

ts.1 = rmvnorm(50, c(0,0))
ts.2 = rmvnorm(60, c(3,4))
out.1 = rmvnorm(5, c(-4,7))

ts.df = rbind(ts.1,ts.2,out.1)

plot(ts.df)
points(ts.1,col=2)
points(ts.2,col=3)
points(out.1,col=4)

Calculate fit probabilities of test data with estimated parameters

fit probability :

\(\sum_{k=\text{1}}^{2}p(\mathbf{z}^{k}_{i}=1,\mathbf{y}_{i}|\hat{\theta}) = \sum_{k=1}^{2}p(\mathbf{y}_{i}|\mathbf{z}^{k}_{i}=\text{1},\hat{\theta})\;p(\mathbf{z}^{k}_{i}=\text{1}|\hat{\theta})\)

norm.den.1 = dmvnorm(ts.df, mean=mu_hat[[1]],sigma=sigma_hat[[1]])
pi.1 = lamb_hat[1]

norm.den.2 = dmvnorm(ts.df, mean=mu_hat[[2]],sigma=sigma_hat[[2]])
pi.2 = lamb_hat[2]

joint.1 = norm.den.1 * pi.1 
joint.2 = norm.den.2 * pi.2 


post.1 = joint.1 / (joint.1 + joint.2)
post.2 = joint.2 / (joint.1 + joint.2)

Visualize posterior probabilities of test data

x = ts.df
post.df = cbind(post.1,post.2)

new.plot.gmm = function(x,post.df,lambda,mu,sigma, xlab2=NULL,ylab2=NULL,main2 = NULL, col2 = NULL, lwd2 = 2, 
         alpha = 0.05, marginal = FALSE){
  if (ncol(x) != 2) {
    stop("The data must have 2 columns!")
  }
  post = apply(post.df, 1, which.max)
  k <- ncol(post.df)
  
  if (is.null(xlab2)) {
    xlab2 <- "X.1"
  }
  if (is.null(ylab2)) {
    ylab2 <- "X.2"
  }
  if (is.null(col2)) {
    col2 <- 2:(k + 1)
  }
  if (marginal == FALSE) {
    if (is.null(main2)) {
      main2 <- "Density Curves"
    }
    plot(x, col = col2[post], xlab = xlab2, ylab = ylab2, 
         main = main2)
    lapply(1:k, function(i) points(mu[[i]][1], mu[[i]][2], 
                                   pch = 19))
    for (i in 1:k) {
      for (j in 1:length(alpha)) {
        mixtools::ellipse(mu = mu[[i]], sigma = sigma[[i]], 
                alpha = alpha[j], col = col2[i])
      }
    }
  }
}


new.plot.gmm(ts.df,post.df,lambda=lamb_hat,mu=mu_hat,sigma=sigma_hat)

Find low fit probabilities

joint.df = data.frame(x=ts.df,joint=joint.1+joint.2)
head(joint.df[order(joint.df$joint),],5)
##           x.1      x.2        joint
## 112 -3.864747 7.664568 9.296272e-15
## 111 -4.973718 5.876956 1.205068e-13
## 113 -1.871619 8.601926 4.727971e-11
## 114 -2.798201 6.875329 6.445182e-11
## 115 -2.982720 6.536155 6.887655e-11
head.joint.df = head(joint.df[order(joint.df$joint),],5)
plot(ts.df)
points(head.joint.df[,c(1,2)],col=4)

2. When there is no information of normal behavior

Scatter plot

plot(ts.df)

Fit EM-algorithm to the full data

fit = mvnormalmixEM(ts.df,k=2,maxit = 100)
## number of iterations= 11
summary(fit)
## summary of mvnormalmixEM object:
##           comp 1   comp 2
## lambda  0.501492 0.498508
## mu1    -0.392771 0.742237
## mu2     2.811644 4.180787
## loglik at estimate:  -431.7735
lamb_hat = fit$lambda
mu_hat = fit$mu
sigma_hat = fit$sigma

fit$sigma
## [[1]]
##           [,1]      [,2]
## [1,]  1.527146 -1.569154
## [2,] -1.569154  5.287890
## 
## [[2]]
##            [,1]       [,2]
## [1,] 0.64617581 0.01251621
## [2,] 0.01251621 0.71033430
plot(fit,whichplots = 1)

plot(fit,whichplots = 2)

Calculate fit (joint) probabilities for all observations

fit probability :

\(\sum_{k=\text{1}}^{2}p(\mathbf{z}^{k}_{i}=1,\mathbf{y}_{i}|\hat{\theta}) = \sum_{k=1}^{2}p(\mathbf{y}_{i}|\mathbf{z}^{k}_{i}=\text{1},\hat{\theta})\;p(\mathbf{z}^{k}_{i}=\text{1}|\hat{\theta})\)

norm.den.1 = dmvnorm(ts.df, mean=mu_hat[[1]],sigma=sigma_hat[[1]])
pi.1 = lamb_hat[1]

norm.den.2 = dmvnorm(ts.df, mean=mu_hat[[2]],sigma=sigma_hat[[2]])
pi.2 = lamb_hat[2]

joint.1 = norm.den.1 * pi.1 
joint.2 = norm.den.2 * pi.2 


post.1 = joint.1 / (joint.1 + joint.2)
post.2 = joint.2 / (joint.1 + joint.2)

Show 5 observations which have smallest fit (joint) probabilities

joint.df = data.frame(x=ts.df,joint=joint.1+joint.2)
head(joint.df[order(joint.df$joint),],5)
##           x.1      x.2        joint
## 111 -4.973718 5.876956 3.410138e-05
## 113 -1.871619 8.601926 6.945341e-05
## 112 -3.864747 7.664568 1.407629e-04
## 114 -2.798201 6.875329 8.179183e-04
## 115 -2.982720 6.536155 9.860292e-04
head.joint.df = head(joint.df[order(joint.df$joint),],5)
plot(ts.df)
points(head.joint.df[,c(1,2)],col=4)