Abstract
続わかりやすいパターン認識, Chapter 9
require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)
require(flexmix)
## Loading required package: flexmix
require(mclust)
## Loading required package: mclust
## Package 'mclust' version 5.2
## Type 'citation("mclust")' for citing this R package in publications.
Working through FlexMix: A General Framework for Finite Mixture Models and Latent Class Regression in R.
data("NPreg")
ggplot(NPreg, aes(x=x, y=yn)) + geom_point()
m1 <- flexmix(yn ~ x + I(x^2), data = NPreg, k = 2)
summary(m1)
##
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
##
## prior size post>0 ratio
## Comp.1 0.494 100 145 0.690
## Comp.2 0.506 100 141 0.709
##
## 'log Lik.' -642.5452 (df=9)
## AIC: 1303.09 BIC: 1332.775
How to read this: the columns are the estimated prior probabilities \(\hat{\pi}_k\), the number of observations assigned to the corresponding cluster, the number of observations with \(p_{nk} > \delta\) (default \(\delta = 10^{-4}\)), and the ratio of the latter two numbers. Here \(p_{nk} = \hat{\pi}_k f(y_n \mid x_n, \hat{\theta}_k) / \sum_j \hat{\pi}_j f(y_n \mid x_n, \hat{\theta}_j)\) is the posterior probability that observation n is assigned to cluster k. The closer the ratio is to 1, the better separated the clusters are.
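These columns can be recomputed from the fitted model; a minimal sketch using flexmix's posterior() and clusters() accessors, with the default delta = 1e-4 hard-coded:
# Recompute size, post>0, and ratio from the posteriors p_nk
p <- posterior(m1)              # n x k matrix of posterior probabilities p_nk
size <- table(clusters(m1))     # observations hard-assigned to each cluster
postgt <- colSums(p > 1e-4)     # observations with p_nk > delta
cbind(size, postgt, ratio = size / postgt)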
require(mclust)
data(diabetes)
class = diabetes$class
table(class)
## class
## Chemical Normal Overt
## 36 76 33
X = diabetes[,-1] # drop the class labels (unsupervised setting)
head(X)
## glucose insulin sspg
## 1 80 356 124
## 2 97 289 117
## 3 105 319 143
## 4 90 356 199
## 5 90 323 240
## 6 86 381 157
BIC = mclustBIC(X)
plot(BIC)
Names such as EII describe the shape of the mixture components:
“EII”: spherical, equal volume
“VII”: spherical, varying volume
“EEE”: ellipsoidal, equal volume, shape, and orientation
“VVV”: ellipsoidal, varying volume, shape, and orientation
and so on.
BIC is maximized by the VVV model with k = 3.
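As a quick check (a sketch), summary() on the mclustBIC object ranks the (model, number of components) combinations by BIC, and mclustModelNames() spells out what a model code means:
summary(BIC)              # top models by BIC; VVV with 3 components should rank first
mclustModelNames("VVV")   # "ellipsoidal, varying volume, shape, and orientation"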
mod1 = Mclust(X, x = BIC)
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
##
## log.likelihood n df BIC ICL
## -2307.883 145 29 -4760.091 -4776.086
##
## Clustering table:
## 1 2 3
## 82 33 30
##
## Mixing probabilities:
## 1 2 3
## 0.5603211 0.2244432 0.2152356
##
## Means:
## [,1] [,2] [,3]
## glucose 91.39558 105.1109 219.21971
## insulin 358.61206 516.2814 1040.59177
## sspg 166.02012 320.2471 98.56807
##
## Variances:
## [,,1]
## glucose insulin sspg
## glucose 61.81664 97.41582 34.42346
## insulin 97.41582 2106.98136 378.95467
## sspg 34.42346 378.95467 2669.14406
## [,,2]
## glucose insulin sspg
## glucose 152.2496 789.1576 -483.0501
## insulin 789.1576 6476.1400 -2752.2840
## sspg -483.0501 -2752.2840 26029.0307
## [,,3]
## glucose insulin sspg
## glucose 6350.858 26190.11 -4448.25
## insulin 26190.111 122126.21 -22772.10
## sspg -4448.250 -22772.10 5913.76
plot(mod1, "classification")
# Generate n samples from a two-component univariate Gaussian mixture:
# with probability pi1 draw from N(mu1, sigma1^2), otherwise from N(mu2, sigma2^2).
gmmgen <- function(n, mu1, mu2, sigma1, sigma2, pi1) {
  data1 <- rnorm(n, mean = mu1, sd = sigma1)
  data2 <- rnorm(n, mean = mu2, sd = sigma2)
  data3 <- (runif(n) <= pi1)                 # TRUE: this draw comes from component 1
  dat <- data1 * data3 + data2 * (1 - data3)
  list(data = dat, class = (data3 + 1))      # class 2 = component 1 (mu1), class 1 = component 2 (mu2)
}
d <- gmmgen(n=500, mu1=3, mu2=-1, sigma1=1, sigma2=1, pi1=0.6)
gmmdata <- d$data
class <- d$class
hist(gmmdata)
m2 <- Mclust(gmmdata)
summary(m2)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust E (univariate, equal variance) model with 2 components:
##
## log.likelihood n df BIC ICL
## -1031.982 500 4 -2088.822 -2117.648
##
## Clustering table:
## 1 2
## 222 278
plot(m2, "density", xlim=c(-4,6))
par(new=T)
hist(gmmdata, xlim=c(-4,6))
table(class, m2$classification)
##
## class 1 2
## 1 213 3
## 2 9 275
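The component labels returned by Mclust are arbitrary and need not match the generating labels, so a label-permutation-invariant score such as the adjusted Rand index is a convenient accuracy summary; a sketch using mclust's adjustedRandIndex():
adjustedRandIndex(class, m2$classification)   # 1 = perfect agreement up to relabelling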
require(MASS)   # mvrnorm() for sampling from a bivariate normal
n <- 1000
x <- sapply(1:n, function(i) if (runif(1) > 0.6) mvrnorm(1, c(-1,1), matrix(c(1, 0.5, 0.5, 1), 2)) else mvrnorm(1, c(1,-1), matrix(c(1, 0.7, 0.7, 1), 2)))
X <- t(x)
plot(X)
m3 <- Mclust(X)
summary(m3)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVE (ellipsoidal, equal orientation) model with 2 components:
##
## log.likelihood n df BIC ICL
## -3202.104 1000 10 -6473.285 -6513.4
##
## Clustering table:
## 1 2
## 597 403
plot(m3, "density", xlim=c(-5,5), ylim=c(-5,5))
par(new=T)
plot(X, xlim=c(-5,5), ylim=c(-5,5))
To draw the decision boundary, contour() can be used.
xgrid <- seq(-5,5,0.01)
ygrid <- seq(-5,5,0.01)
grid <- expand.grid(xgrid, ygrid)
pred <- predict(m3, grid)
contour(xgrid,ygrid,array(pred$classification,dim=c(length(xgrid),length(ygrid))), xlim=c(-5,5), ylim=c(-5,5), drawlabels=F, col = "blue")
par(new=T)
plot(X, xlim=c(-5,5), ylim=c(-5,5))
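An alternative sketch: instead of contouring the hard classification, the boundary can be drawn as the 0.5 level set of the posterior probability of component 1 (the z element returned by predict.Mclust), which gives a single smooth curve.
z1 <- array(pred$z[, 1], dim = c(length(xgrid), length(ygrid)))   # posterior of component 1 on the grid
contour(xgrid, ygrid, z1, levels = 0.5, drawlabels = FALSE, col = "red",
        xlim = c(-5, 5), ylim = c(-5, 5))
par(new = TRUE)
plot(X, xlim = c(-5, 5), ylim = c(-5, 5))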