require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)
require(flexmix)
## Loading required package: flexmix
require(mclust)
## Loading required package: mclust
## Package 'mclust' version 5.2
## Type 'citation("mclust")' for citing this R package in publications.

1 flexmix

FlexMix: A General Framework for Finite Mixture Models and Latent Class Regression in R のお勉強。

data("NPreg")
m1 <- flexmix(yn ~ x + I(x^2), data = NPreg, k = 2)
ggplot(NPreg, aes(x=x, y=yn)) + geom_point()

m1 <- flexmix(yn ~ x + I(x^2), data=NPreg,k=2)
summary(m1)
## 
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
## 
##        prior size post>0 ratio
## Comp.1 0.494  100    145 0.690
## Comp.2 0.506  100    141 0.709
## 
## 'log Lik.' -642.5452 (df=9)
## AIC: 1303.09   BIC: 1332.775

読み方:the estimated prior probabilities ˆπk, the number of observations assigned to the corresponding clusters, the number of observations where \(p_{nk}\) > δ (with a default of δ = 10−4), and the ratio of the latter two numbers. \(p_{nk}\)はn番目のデータがクラスタkに割り当てられる事後確率。 ratioが1に近いほどクラスタリングがよいと言える。

2 mclust

require(mclust)
data(diabetes)
class = diabetes$class
table(class)
## class
## Chemical   Normal    Overt 
##       36       76       33
X = diabetes[,-1] # 教師ラベルを消す
head(X)
##   glucose insulin sspg
## 1      80     356  124
## 2      97     289  117
## 3     105     319  143
## 4      90     356  199
## 5      90     323  240
## 6      86     381  157
BIC = mclustBIC(X)
plot(BIC)

EIIなどは要素の形状を表している。  “EII”: 球型、同体積
 “VII”: 球型、異なる体積
 “EEE”: 楕円球型、同体積・形・向き
 “VVV”: 楕円球型、異なる体積・形・向き
 等を表す。
 BICが最大なのはVVVのk=3のモデルである。  

mod1 = Mclust(X, x = BIC)
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -2307.883 145 29 -4760.091 -4776.086
## 
## Clustering table:
##  1  2  3 
## 82 33 30 
## 
## Mixing probabilities:
##         1         2         3 
## 0.5603211 0.2244432 0.2152356 
## 
## Means:
##              [,1]     [,2]       [,3]
## glucose  91.39558 105.1109  219.21971
## insulin 358.61206 516.2814 1040.59177
## sspg    166.02012 320.2471   98.56807
## 
## Variances:
## [,,1]
##          glucose    insulin       sspg
## glucose 61.81664   97.41582   34.42346
## insulin 97.41582 2106.98136  378.95467
## sspg    34.42346  378.95467 2669.14406
## [,,2]
##           glucose    insulin       sspg
## glucose  152.2496   789.1576  -483.0501
## insulin  789.1576  6476.1400 -2752.2840
## sspg    -483.0501 -2752.2840 26029.0307
## [,,3]
##           glucose   insulin      sspg
## glucose  6350.858  26190.11  -4448.25
## insulin 26190.111 122126.21 -22772.10
## sspg    -4448.250 -22772.10   5913.76
plot(mod1, "classification")

3 一次元正規分布

gmmgen <- function(n, mu1, mu2, sigma1, sigma2, pi1) {
   data1 <- rnorm(n, mean=mu1, sd=sigma1)
   data2 <- rnorm(n, mean=mu2, sd=sigma2)
   data3 <- (runif(n) <= pi1) 
   dat <- data1*data3 + data2*(1-data3) 
   list(data=dat, class=(data3+1))
}
d <- gmmgen(n=500, mu1=3, mu2=-1, sigma1=1, sigma2=1, pi1=0.6)
gmmdata <- d$data
class <- d$class
hist(gmmdata)

m2 <- Mclust(gmmdata)
summary(m2)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust E (univariate, equal variance) model with 2 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -1031.982 500  4 -2088.822 -2117.648
## 
## Clustering table:
##   1   2 
## 222 278
plot(m2, "density", xlim=c(-4,6))
par(new=T)
hist(gmmdata, xlim=c(-4,6))

table(class, m2$classification)
##      
## class   1   2
##     1 213   3
##     2   9 275

4 二次元

n <- 1000
x <- sapply(1:n, function(i) if (runif(1) > 0.6) mvrnorm(1, c(-1,1), matrix(c(1, 0.5, 0.5, 1), 2)) else mvrnorm(1, c(1,-1), matrix(c(1, 0.7, 0.7, 1), 2)))
X <- t(x)
plot(X)

m3 <- Mclust(X)
summary(m3)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 2 components:
## 
##  log.likelihood    n df       BIC     ICL
##       -3202.104 1000 10 -6473.285 -6513.4
## 
## Clustering table:
##   1   2 
## 597 403
plot(m3, "density", xlim=c(-5,5), ylim=c(-5,5))
par(new=T)
plot(X, xlim=c(-5,5), ylim=c(-5,5))

識別境界を書くにはcontour()を使えばよい。

xgrid <- seq(-5,5,0.01)
ygrid <- seq(-5,5,0.01)
grid <- expand.grid(xgrid, ygrid)
pred <- predict(m3, grid)
contour(xgrid,ygrid,array(pred$classification,dim=c(length(xgrid),length(ygrid))), xlim=c(-5,5), ylim=c(-5,5), drawlabels=F, col = "blue")
par(new=T)
plot(X, xlim=c(-5,5), ylim=c(-5,5))