箱の中に外見上全く区別の付かない三種類のコイン\(\omega_1, \omega_2, \omega_3\)が大量にある。その含有率を0.1, 0.4, 0.5とする。それぞれのコインを投げて表が出る確率を0.8, 0.6, 0.3とする。 コインを一枚箱から取り出し、 続けて\(n\)回投げて、\(r\)回表が出たときコインの種類を推定せよ。

MAP推定で識別することをベイズ決定則という。つまり \[ \hat{\omega_i} = \mathrm{argmax}_k P(\omega_k | \vec{x}^{(n)}) = \mathrm{argmax}_k P(\omega_k , \vec{x}^{(n)}) \] が識別則である。

\[ P(\omega_k , \vec{x}^{(n)}) = P(\vec{x}^{(n)} | \omega_k) P(\omega_k) = {}_nC_r\theta_{k}^r(1-\theta_k)^{n-r} \pi_k \propto \theta_{k}^r(1-\theta_k)^{n-r} \pi_k \] ここで\(\theta_k\)は表が出る確率、\(\pi_k\)はコインの含有率である。

require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)

1 事後確率

decitionFunc <- function(n, r, thetak, pik){
  thetak ^ r * (1 - thetak) ^ (n - r) * pik 
}

plotPosterior <- function(n, thetas, pis){
  rs <- 1:n
  rlog <- c()
  glog <- c()
  coinlog <- c()
  for (r in rs) {
    for (i_coin in 1:3) {
      g <- decitionFunc(n, r, thetas[i_coin], pis[i_coin])
      glog <- c(glog, g)
      rlog <- c(rlog, r)
      coinlog <- c(coinlog, i_coin)
    }
  }
  coinlog <- as.factor(coinlog) 
  df <- data.frame(r = rlog, g = glog, coin=coinlog)
  dfSum <- group_by(df, r) %>% summarise(sumg=sum(g))
  dfPost <- left_join(df, dfSum, by="r")
  dfPost <- mutate(dfPost, posterior=g/sumg)
  ggplot(dfPost, aes(x=r, y=posterior, group=coin, colour=coin)) + geom_line()
}

thetas <- c(0.8, 0.6, 0.3)
pis <-  c(0.1, 0.4, 0.5)
n <- 10
plotPosterior(n = n, thetas = thetas, pis =pis)

plotPosterior(100, thetas, pis)

2 ベイズ誤り確率

calceB <- function(n, thetas, pis){
  rs <- 1:n
  rlog <- c()
  glog <- c()
  coinlog <- c()
  for (r in rs) {
    for (i_coin in 1:3) {
      g <- decitionFunc(n, r, thetas[i_coin], pis[i_coin])
      glog <- c(glog, g)
      rlog <- c(rlog, r)
      coinlog <- c(coinlog, i_coin)
    }
  }
  coinlog <- as.factor(coinlog) 
  df <- data.frame(r = rlog, g = glog, coin=coinlog)
  dfSum <- group_by(df, r) %>% summarise(sumg=sum(g))
  dfPost <- left_join(df, dfSum, by="r")
  dfPost <- mutate(dfPost, posterior=g/sumg)
  dfCBE <-  group_by(dfPost, r) %>%
              dplyr::summarise( maxpost=max(posterior)) %>%
              mutate(eB=1-maxpost)
  prior <- c()
  likelihood <- c()
  rlog <- c()
  for (r in 1:n){
    for (i in 1:3){
      prior <- c(prior, pis[i])
      likelihood <- c(likelihood, choose(n, r) * thetas[i]^r * (1- thetas[i])^(n-r) )
      rlog <- c(rlog, r)
    }
  }
  df <- data.frame(r=rlog, prior=prior, likelihood=likelihood) 
  dfR <- df %>%
    mutate(promega=prior * likelihood) %>%
    group_by(r) %>% 
    summarise(pr=sum(promega))
  dfeb <- inner_join(dfCBE, dfR, by="r") %>%
    mutate(ebr=eB * pr)
  return(sum(dfeb$ebr))
}

ebs <- c()
ns <- seq(0,100,10)
for (n in ns) {
  eb <- calceB(n = n,thetas = thetas, pis = pis)
  ebs <- c(ebs, eb)
}
ggplot(data.frame(n=ns, BayesError=ebs), aes(x=n, y=BayesError)) + geom_line()

3 ROC

require(ROCR)
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
score <- c(0.9,0.8,0.7,0.55,0.45,0.4,0.34,0.3,0.2,0.1)
class <- c(1,1,0,1,1,0,0,1,0,0)
pred <- prediction(score, class)
perf <- performance(pred, 'tpr', 'fpr')
plot(perf)