階層ベイズモデル サンプル1

library(ggplot2)
library(reshape2)
library(scales)
library(xtable)
library(bayesm)

打率データを作成

以下の資料のデータを参考にサンプルデータを生成する
http://eprints.lib.hokudai.ac.jp/dspace/bitstream/2115/39717/2/kubo2009IEICE.pdf

サマリデータの読み込み

batting_average_data <- read.csv("batting_average.csv", 
    header = T)
batting_average_data$batter <- as.factor(batting_average_data$batter)

サンプルデータ生成

真の打率から、サンプルデータを作成

set.seed(5)
batting_average_raw_data <- do.call(rbind, lapply(1:nrow(batting_average_data), 
    function(i) {
        d <- batting_average_data[i, ]
        m <- d$batting_average_true
        n <- d$bats
        y <- rbinom(n, 1, m)
        data.frame(id = d$batter, hit = y, x = 1)
    }))

batting_average_data2 <- dcast(batting_average_raw_data, 
    id ~ "batting_average", function(x) {
        sum(x)/length(x)
    }, value.var = "hit")
batting_average_data2$batting_average_true <- batting_average_data$batting_average_true

生成されたサンプルデータ

print.xtable(xtable::xtable(batting_average_data2), 
    type = "html")
id batting_average batting_average_true
1 1 0.23 0.27
2 2 0.43 0.28
3 3 0.26 0.28
4 4 0.26 0.29
5 5 0.30 0.29
6 6 0.33 0.29
7 7 0.25 0.29
8 8 0.20 0.29
9 9 0.40 0.30
10 10 0.33 0.30
11 11 0.30 0.30
12 12 0.22 0.30
13 13 0.31 0.30
14 14 0.39 0.31
15 15 0.32 0.31
16 16 0.28 0.31
17 17 0.35 0.31
18 18 0.24 0.32
19 19 0.31 0.32
20 20 0.39 0.33

割り算推定打率と真の打率

ggplot(batting_average_data2, aes(x = batting_average_true, 
    y = batting_average, col = id)) + geom_point() + geom_abline() + 
    xlab("真の打率") + ylab("割り算推定打率") + 
    ylim(0.15, 0.45) + ggtitle("割り算推定打率と真の打率")

plot of chunk unnamed-chunk-6

階層ベイズモデルを使って打率を推定

rhierBinLogit用のデータを作成

id.list <- unique(batting_average_raw_data$id)
lgtdata <- lapply(id.list, function(id) {
    d <- batting_average_raw_data[as.numeric(batting_average_raw_data$id) == 
        as.numeric(id), ]
    list(X = matrix(d$x, nrow = nrow(d), ncol = 1), y = d$hit, 
        beta = matrix(1, nrow = 1, ncol = nrow(d)))
})

パラメータ推定

R <- 10000
set.seed(5)
out <- rhierBinLogit(Data = list(lgtdata = lgtdata), 
    Mcmc = list(R = R))

beta <- rowMeans(out$betadraw[, , seq(9000, 10000, 
    10)])
batting_average_data2$rhierBinLogitResult <- exp(beta)/(1 + 
    exp(beta))

階層ベイズモデルによる推定打率と真の打率

ggplot(batting_average_data2, aes(x = batting_average_true, 
    y = rhierBinLogitResult, col = id)) + geom_point() + 
    geom_abline() + xlab("真の打率") + ylab("階層ベイズモデル推定打率") + 
    ylim(0.15, 0.45) + ggtitle("階層ベイズモデル推定打率と真の打率")

plot of chunk unnamed-chunk-9

割り算推定打率と階層ベイズモデルによる推定打率の比較

batting_average_data.melt <- melt(batting_average_data2, 
    id.vars = c("batting_average_true"), measure.vars = c("batting_average", 
        "rhierBinLogitResult"))

ggplot(batting_average_data.melt, aes(x = batting_average_true, 
    y = value, col = variable)) + geom_point() + geom_abline() + 
    xlab("真の打率") + ylab("推定打率") + ylim(0.15, 
    0.45) + ggtitle("推定打率と真の打率")

plot of chunk unnamed-chunk-10