Abstract
続わかりやすいパターン認識, Chapter 3. A box contains a large number of coins of three types \(\omega_1, \omega_2, \omega_3\) that are indistinguishable by appearance. Their proportions in the box are 0.1, 0.4, and 0.5, and the probabilities that each type lands heads are 0.8, 0.6, and 0.3, respectively. A coin is drawn from the box and tossed \(n\) times; given that \(r\) heads were observed, estimate which type of coin it is.
Classifying by MAP estimation is the Bayes decision rule. That is, \[ \hat{\omega} = \mathrm{argmax}_k P(\omega_k \mid \vec{x}^{(n)}) = \mathrm{argmax}_k P(\omega_k , \vec{x}^{(n)}) \] is the decision rule.
\[ P(\omega_k , \vec{x}^{(n)}) = P(\vec{x}^{(n)} \mid \omega_k) P(\omega_k) = {}_nC_r\,\theta_{k}^r(1-\theta_k)^{n-r} \pi_k \propto \theta_{k}^r(1-\theta_k)^{n-r} \pi_k \] where \(\theta_k\) is the probability that coin \(\omega_k\) lands heads and \(\pi_k\) is its proportion in the box; the binomial coefficient \({}_nC_r\) does not depend on \(k\) and can be dropped.
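Before the full analysis, here is a minimal sketch of the decision rule itself: for observed \(n\) and \(r\), pick the \(k\) that maximizes \(\theta_k^r(1-\theta_k)^{n-r}\pi_k\). The function name classifyCoin and the example call are illustrative, not from the book.
# Minimal sketch of the MAP rule (classifyCoin is an illustrative name)
classifyCoin <- function(n, r, thetas, pis) {
  # index k that maximizes theta_k^r * (1 - theta_k)^(n - r) * pi_k
  which.max(thetas ^ r * (1 - thetas) ^ (n - r) * pis)
}
classifyCoin(n = 10, r = 7, thetas = c(0.8, 0.6, 0.3), pis = c(0.1, 0.4, 0.5))  # 2, i.e. omega_2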
require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)
# Decision function g_k(r) = theta_k^r * (1 - theta_k)^(n - r) * pi_k,
# i.e. the posterior up to a normalizing constant that does not depend on k
decitionFunc <- function(n, r, thetak, pik){
  thetak ^ r * (1 - thetak) ^ (n - r) * pik
}
plotPosterior <- function(n, thetas, pis){
  # Posterior P(omega_k | r) for every possible number of heads r = 0, ..., n
  rs <- 0:n
  rlog <- c()
  glog <- c()
  coinlog <- c()
  for (r in rs) {
    for (i_coin in 1:3) {
      g <- decitionFunc(n, r, thetas[i_coin], pis[i_coin])  # unnormalized posterior
      glog <- c(glog, g)
      rlog <- c(rlog, r)
      coinlog <- c(coinlog, i_coin)
    }
  }
  coinlog <- as.factor(coinlog)
  df <- data.frame(r = rlog, g = glog, coin = coinlog)
  # Normalize over the three coins for each r to obtain the posteriors
  dfSum <- group_by(df, r) %>% summarise(sumg = sum(g))
  dfPost <- left_join(df, dfSum, by = "r")
  dfPost <- mutate(dfPost, posterior = g / sumg)
  ggplot(dfPost, aes(x = r, y = posterior, group = coin, colour = coin)) + geom_line()
}
thetas <- c(0.8, 0.6, 0.3)  # probability of heads for omega_1, omega_2, omega_3
pis <- c(0.1, 0.4, 0.5)     # proportions of the coin types in the box
n <- 10
plotPosterior(n = n, thetas = thetas, pis = pis)
plotPosterior(100, thetas, pis)
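The next step is the Bayes error, i.e. the probability that the MAP rule picks the wrong coin. A standard identity for 0-1 loss, stated here to make the code below easier to follow (it is not quoted from the book), is
\[ e_B = \sum_{r=0}^{n}\left(1 - \max_k P(\omega_k \mid r)\right)P(r), \qquad P(r) = \sum_k \pi_k\, {}_nC_r\, \theta_k^r (1-\theta_k)^{n-r}. \]
As \(n\) grows, the posteriors concentrate on the true coin, so \(e_B\) should decrease.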
calceB <- function(n, thetas, pis){
  # Bayes error: sum over r of (1 - max_k P(omega_k | r)) * P(r)
  rs <- 0:n  # r = 0 must be included so that P(r) sums to 1
  rlog <- c()
  glog <- c()
  coinlog <- c()
  for (r in rs) {
    for (i_coin in 1:3) {
      g <- decitionFunc(n, r, thetas[i_coin], pis[i_coin])
      glog <- c(glog, g)
      rlog <- c(rlog, r)
      coinlog <- c(coinlog, i_coin)
    }
  }
  coinlog <- as.factor(coinlog)
  df <- data.frame(r = rlog, g = glog, coin = coinlog)
  dfSum <- group_by(df, r) %>% summarise(sumg = sum(g))
  dfPost <- left_join(df, dfSum, by = "r")
  dfPost <- mutate(dfPost, posterior = g / sumg)
  # Conditional Bayes error 1 - max_k P(omega_k | r) for each r
  dfCBE <- group_by(dfPost, r) %>%
    dplyr::summarise(maxpost = max(posterior)) %>%
    mutate(eB = 1 - maxpost)
  # Marginal probability P(r) = sum_k pi_k * C(n, r) * theta_k^r * (1 - theta_k)^(n - r)
  prior <- c()
  likelihood <- c()
  rlog <- c()
  for (r in 0:n){
    for (i in 1:3){
      prior <- c(prior, pis[i])
      likelihood <- c(likelihood, choose(n, r) * thetas[i]^r * (1 - thetas[i])^(n - r))
      rlog <- c(rlog, r)
    }
  }
  df <- data.frame(r = rlog, prior = prior, likelihood = likelihood)
  dfR <- df %>%
    mutate(promega = prior * likelihood) %>%
    group_by(r) %>%
    summarise(pr = sum(promega))
  # Weight each conditional error by P(r) and sum
  dfeb <- inner_join(dfCBE, dfR, by = "r") %>%
    mutate(ebr = eB * pr)
  return(sum(dfeb$ebr))
}
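As a quick sanity check (a worked example of my own, not from the book): for a single toss, \(P(r=1) = 0.1\cdot0.8 + 0.4\cdot0.6 + 0.5\cdot0.3 = 0.47\) and \(P(r=0) = 0.53\); the MAP rule picks \(\omega_2\) on heads and \(\omega_3\) on tails, so \(e_B = (0.47 - 0.24) + (0.53 - 0.35) = 0.41\). With \(r\) running over \(0, \dots, n\), calceB reproduces this value.
calceB(n = 1, thetas = thetas, pis = pis)  # 0.41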
ebs <- c()
ns <- seq(0, 100, 10)  # numbers of tosses to evaluate
for (n in ns) {
  eb <- calceB(n = n, thetas = thetas, pis = pis)
  ebs <- c(ebs, eb)
}
ggplot(data.frame(n = ns, BayesError = ebs), aes(x = n, y = BayesError)) + geom_line()
As a related topic, the ROCR package can be used to draw an ROC curve (true positive rate against false positive rate) for a toy set of scores and true labels.
require(ROCR)
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
score <- c(0.9, 0.8, 0.7, 0.55, 0.45, 0.4, 0.34, 0.3, 0.2, 0.1)  # classifier scores
class <- c(1, 1, 0, 1, 1, 0, 0, 1, 0, 0)                         # true labels (1 = positive)
pred <- prediction(score, class)         # ROCR prediction object
perf <- performance(pred, 'tpr', 'fpr')  # true positive rate vs. false positive rate
plot(perf)
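If a single summary number is preferred, ROCR can also report the area under this curve. This is a small usage sketch; the value in the comment is what these ten toy points work out to (20 of the 25 positive/negative pairs are ranked correctly).
auc <- performance(pred, measure = "auc")  # AUC as a performance object
auc@y.values[[1]]  # 0.8 for the toy scores above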