贝塔分布

library(ggplot2)
x <- seq(0,1,length=100)
db <- dbeta(x, 81, 219)
ggplot() + geom_line(aes(x,db)) + ylab("Density of beta")

为什么击球的概率分布符合贝塔分布?

\[f(q) \propto q^a(1-q)^b\]

\[Beta(\alpha+1,\beta+0)\]

先验与后验

x <- seq(0,1,length=100)
db <- dbeta(x, 81+1, 219)
ggplot() + geom_line(aes(x,db)) + ylab("Density of beta")

x <- seq(0,1,length=100)
db <- dbeta(x, 81+1000, 219)
ggplot() + geom_line(aes(x,db)) + ylab("Density of beta")

\[\frac{a}{b} = \frac{c}{d} = \frac{a+b}{c+d}\]

经验贝叶斯

library(knitr)
library(dplyr)
library(tidyr)
library(Lahman)
# 拿到击球数据
career <- Batting %>%
  filter(AB > 0) %>%
  anti_join(Pitching, by = "playerID") %>%
  group_by(playerID) %>%
  summarize(H = sum(H), AB = sum(AB)) %>%
  mutate(average = H / AB)

# 把ID换成球员名字
career <- Master %>%
  tbl_df() %>%
  select(playerID, nameFirst, nameLast) %>%
  unite(name, nameFirst, nameLast, sep = " ") %>%
  inner_join(career, by = "playerID") %>%
  select(-playerID)
# 展示数据
career
## Source: local data frame [9,342 x 4]
## 
##                 name     H    AB average
##                (chr) (int) (int)   (dbl)
## 1         Hank Aaron  3771 12364  0.3050
## 2       Tommie Aaron   216   944  0.2288
## 3          Andy Abad     2    21  0.0952
## 4        John Abadie    11    49  0.2245
## 5     Ed Abbaticchio   772  3044  0.2536
## 6        Fred Abbott   107   513  0.2086
## 7        Jeff Abbott   157   596  0.2634
## 8        Kurt Abbott   523  2044  0.2559
## 9         Ody Abbott    13    70  0.1857
## 10 Frank Abercrombie     0     4  0.0000
## ..               ...   ...   ...     ...
# 击球前5
career %>%
  arrange(desc(average)) %>%
  head(5) %>%
  kable()
name H AB average
Jeff Banister 1 1 1
Doc Bass 1 1 1
Steve Biras 2 2 1
C. B. Burns 1 1 1
Jackie Gallagher 1 1 1
# 击球后5
career %>%
  arrange(average) %>%
  head(5) %>%
  kable()
name H AB average
Frank Abercrombie 0 4 0
Lane Adams 0 3 0
Horace Allen 0 7 0
Pete Allen 0 4 0
Walter Alston 0 1 0
career_filtered <- career %>%
    filter(AB >= 500)

m <- MASS::fitdistr(career_filtered$average, dbeta,
                    start = list(shape1 = 1, shape2 = 10))

alpha0 <- m$estimate[1]
beta0 <- m$estimate[2]

# 看下拟合效果

ggplot(career_filtered) +
  geom_histogram(aes(average, y = ..density..), binwidth = .005) +
  stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red",
                size = 1) +
  xlab("Batting average")

从整体到个人

career_eb <- career %>%
    mutate(eb_estimate = (H + alpha0) / (AB + alpha0 + beta0))
# 击球率高
career_eb %>%
  arrange(desc(eb_estimate)) %>%
  head(5) %>%
  kable()
name H AB average eb_estimate
Rogers Hornsby 2930 8173 0.358 0.355
Shoeless Joe Jackson 1772 4981 0.356 0.350
Ed Delahanty 2596 7505 0.346 0.343
Billy Hamilton 2158 6268 0.344 0.340
Harry Heilmann 2660 7787 0.342 0.338
# 击球率低
career_eb %>%
  arrange(eb_estimate) %>%
  head(5) %>%
  kable()
name H AB average eb_estimate
Bill Bergen 516 3028 0.170 0.179
Ray Oyler 221 1265 0.175 0.191
John Vukovich 90 559 0.161 0.196
John Humphries 52 364 0.143 0.196
George Baker 74 474 0.156 0.196
# 整体估计
ggplot(career_eb, aes(average, eb_estimate, color = AB)) +
  geom_hline(yintercept = alpha0 / (alpha0 + beta0), color = "red", lty = 2) +
  geom_point() +
  geom_abline(color = "red") +
  scale_colour_gradient(trans = "log", breaks = 10 ^ (1:5)) +
  xlab("Batting average") +
  ylab("Empirical Bayes batting average")

可信区间与置信区间

# 给出后验分布
career <- Batting %>%
  filter(AB > 0) %>%
  anti_join(Pitching, by = "playerID") %>%
  group_by(playerID) %>%
  summarize(H = sum(H), AB = sum(AB)) %>%
  mutate(average = H / AB)

career <- Master %>%
  tbl_df() %>%
  select(playerID, nameFirst, nameLast) %>%
  unite(name, nameFirst, nameLast, sep = " ") %>%
  inner_join(career, by = "playerID")
career_eb <- career %>%
    mutate(eb_estimate = (H + alpha0) / (AB + alpha0 + beta0))
career_eb <- career_eb %>%
    mutate(alpha1 = H + alpha0,
           beta1 = AB - H + beta0)
# 提取洋基队的数据
yankee_1998 <- c("brosisc01", "jeterde01", "knoblch01", "martiti02", "posadjo01", "strawda01", "willibe02")

yankee_1998_career <- career_eb %>%
    filter(playerID %in% yankee_1998)

# 展示球员的后验分布
library(broom)
yankee_beta <- yankee_1998_career %>%
    inflate(x = seq(.18, .33, .0002)) %>%
    ungroup() %>%
    mutate(density = dbeta(x, alpha1, beta1))

ggplot(yankee_beta, aes(x, density, color = name)) +
    geom_line() +
    stat_function(fun = function(x) dbeta(x, alpha0, beta0),
                  lty = 2, color = "black")

# 提取可信区间
yankee_1998_career <- yankee_1998_career %>%
    mutate(low  = qbeta(.025, alpha1, beta1),
           high = qbeta(.975, alpha1, beta1))
yankee_1998_career %>%
    select(-alpha1, -beta1, -eb_estimate) %>%
    knitr::kable()
playerID name H AB average low high
brosisc01 Scott Brosius 1001 3889 0.257 0.244 0.271
jeterde01 Derek Jeter 3465 11195 0.310 0.300 0.317
knoblch01 Chuck Knoblauch 1839 6366 0.289 0.277 0.298
martiti02 Tino Martinez 1925 7111 0.271 0.260 0.280
posadjo01 Jorge Posada 1664 6092 0.273 0.262 0.283
strawda01 Darryl Strawberry 1401 5418 0.259 0.247 0.270
willibe02 Bernie Williams 2336 7869 0.297 0.286 0.305
# 绘制可信区间
yankee_1998_career %>%
    mutate(name = reorder(name, average)) %>%
    ggplot(aes(average, name)) +
    geom_point() +
    geom_errorbarh(aes(xmin = low, xmax = high)) +
    geom_vline(xintercept = alpha0 / (alpha0 + beta0), color = "red", lty = 2) +
    xlab("Estimated batting average (w/ 95% interval)") +
    ylab("Player")

# 对比置信区间与可信区间
career_eb <- career_eb %>%
    mutate(low = qbeta(.025, alpha1, beta1),
           high = qbeta(.975, alpha1, beta1))

set.seed(2016)

some <- career_eb %>%
    sample_n(20) %>%
    mutate(name = paste0(name, " (", H, "/", AB, ")"))

frequentist <- some %>%
    group_by(playerID, name, AB) %>%
    do(tidy(binom.test(.$H, .$AB))) %>%
    select(playerID, name, estimate, low = conf.low, high = conf.high) %>%
    mutate(method = "Confidence")

bayesian <- some %>%
    select(playerID, name, AB, estimate = eb_estimate,
           low = low, high = high) %>%
    mutate(method = "Credible")

combined <- bind_rows(frequentist, bayesian)

combined %>%
    mutate(name = reorder(name, -AB)) %>%
    ggplot(aes(estimate, name, color = method, group = method)) +
    geom_point() +
    geom_errorbarh(aes(xmin = low, xmax = high)) +
    geom_vline(xintercept = alpha0 / (alpha0 + beta0), color = "red", lty = 2) +
    xlab("Estimated batting average") +
    ylab("Player") +
    labs(color = "")

后验错误率

# 以 Hank Aaron 为例
career_eb %>%
    filter(name == "Hank Aaron") %>%
    do(data_frame(x = seq(.27, .33, .0002),
                  density = dbeta(x, .$alpha1, .$beta1))) %>%
    ggplot(aes(x, density)) +
    geom_line() +
    geom_ribbon(aes(ymin = 0, ymax = density * (x < .3)),
                alpha = .1, fill = "red") +
    geom_vline(color = "red", lty = 2, xintercept = .3)

# 提取该球员数据
career_eb %>% filter(name == "Hank Aaron")
## Source: local data frame [1 x 10]
## 
##    playerID       name     H    AB average eb_estimate alpha1 beta1   low
##       (chr)      (chr) (int) (int)   (dbl)       (dbl)  (dbl) (dbl) (dbl)
## 1 aaronha01 Hank Aaron  3771 12364   0.305       0.304   3850  8819 0.296
## Variables not shown: high (dbl)
# 计算其不进入名人堂的概率
pbeta(.3, 3850, 8818)
## [1] 0.169
# 所有球员的后验错误率分布,大部分不超过0.3
career_eb <- career_eb %>%
    mutate(PEP = pbeta(.3, alpha1, beta1))
ggplot(career_eb, aes(PEP)) +
    geom_histogram(binwidth = .02) +
    xlab("Posterior Error Probability (PEP)") +
    xlim(0, 1)

# 后验错误率与击球率的关系
career_eb %>%
    ggplot(aes(eb_estimate, PEP, color = AB)) +
    geom_point(size = 1) +
    xlab("(Shrunken) batting average estimate") +
    ylab("Posterior Error Probability (PEP)") +
    geom_vline(color = "red", lty = 2, xintercept = .3) +
    scale_colour_gradient(trans = "log", breaks = 10 ^ (1:5))

错误发现率(FDR)

# 取前100个球员
top_players <- career_eb %>%
    arrange(PEP) %>%
    head(100)
# 总错率率
sum(top_players$PEP)
## [1] 4.69
# 平均错误率
mean(top_players$PEP)
## [1] 0.0469
# 错误率随所取球员的变化
sorted_PEP <- career_eb %>%
    arrange(PEP)

mean(head(sorted_PEP$PEP, 50))
## [1] 0.00113
mean(head(sorted_PEP$PEP, 200))
## [1] 0.241

q值

# 生成每个球员的q值
career_eb <- career_eb %>%
    arrange(PEP) %>%
    mutate(qvalue = cummean(PEP))
# 观察不同q值对名人堂球员数的影响
career_eb %>%
    ggplot(aes(qvalue, rank(PEP))) +
    geom_line() +
    xlab("q-value cutoff") +
    ylab("Number of players included")

# 观察小q值部分
career_eb %>%
    filter(qvalue < .25) %>%
    ggplot(aes(qvalue, rank(PEP))) +
    geom_line() +
    xlab("q-value cutoff") +
    ylab("Number of players included")

贝叶斯视角的假设检验

# 选三个球员
career_eb %>%
  filter(name %in% c("Hank Aaron", "Mike Piazza", "Hideki Matsui")) %>%
  inflate(x = seq(.26, .33, .00025)) %>%
  mutate(density = dbeta(x, alpha1, beta1)) %>%
  ggplot(aes(x, density, color = name)) +
  geom_line() +
  labs(x = "Batting average", color = "")

模拟

  • 单纯取样比大小然后计算比例
# 提取两人数据
aaron <- career_eb %>% filter(name == "Hank Aaron")
piazza <- career_eb %>% filter(name == "Mike Piazza")
# 模拟取样10万次
piazza_simulation <- rbeta(1e6, piazza$alpha1, piazza$beta1)
aaron_simulation <- rbeta(1e6, aaron$alpha1, aaron$beta1)
# 计算一个人超过另一个人的概率
sim <- mean(piazza_simulation > aaron_simulation)
sim
## [1] 0.604

数值积分

  • 两个概率的联合概率分布,然后积分一个队员大于另一个的概率
d <- .00002
limits <- seq(.29, .33, d)
sum(outer(limits, limits, function(x, y) {
  (x > y) *
    dbeta(x, piazza$alpha1, piazza$beta1) *
    dbeta(y, aaron$alpha1, aaron$beta1) *
    d ^ 2
}))
## [1] 0.604

解析解

  • 两个贝塔分布一个比另一个高是有含有贝塔函数的解析解的:

\[p_A \sim \mbox{Beta}(\alpha_A, \beta_A)\]

\[p_B \sim \mbox{Beta}(\alpha_B, \beta_B)\]

\[{\rm Pr}(p_B > p_A) = \sum_{i=0}^{\alpha_B-1}\frac{B(\alpha_A+i,\beta_A+\beta_B)}{(\beta_B+i) B(1+i, \beta_B) B(\alpha_A, \beta_A) }\]

h <- function(alpha_a, beta_a,
              alpha_b, beta_b) {
  j <- seq.int(0, round(alpha_b) - 1)
  log_vals <- (lbeta(alpha_a + j, beta_a + beta_b) - log(beta_b + j) -
               lbeta(1 + j, beta_b) - lbeta(alpha_a, beta_a))
  1 - sum(exp(log_vals))
}

h(piazza$alpha1, piazza$beta1,
  aaron$alpha1, aaron$beta1)
## [1] 0.605

正态近似求解

  • 贝塔分布在\(\alpha\)\(\beta\)比较大时接近正态分布,可以直接用正态分布的解析解求,速度快很多
h_approx <- function(alpha_a, beta_a,
                     alpha_b, beta_b) {
  u1 <- alpha_a / (alpha_a + beta_a)
  u2 <- alpha_b / (alpha_b + beta_b)
  var1 <- alpha_a * beta_a / ((alpha_a + beta_a) ^ 2 * (alpha_a + beta_a + 1))
  var2 <- alpha_b * beta_b / ((alpha_b + beta_b) ^ 2 * (alpha_b + beta_b + 1))
  pnorm(0, u2 - u1, sqrt(var1 + var2))
}

h_approx(piazza$alpha1, piazza$beta1, aaron$alpha1, aaron$beta1)
## [1] 0.606

比例检验

two_players <- bind_rows(aaron, piazza)

two_players %>%
  transmute(Player = name, Hits = H, Misses = AB - H) %>%
  knitr::kable()
Player Hits Misses
Hank Aaron 3771 8593
Mike Piazza 2127 4784
prop.test(two_players$H, two_players$AB)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  two_players$H out of two_players$AB
## X-squared = 0.1, df = 1, p-value = 0.7
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0165  0.0109
## sample estimates:
## prop 1 prop 2 
##  0.305  0.308
credible_interval_approx <- function(a, b, c, d) {
  u1 <- a / (a + b)
  u2 <- c / (c + d)
  var1 <- a * b / ((a + b) ^ 2 * (a + b + 1))
  var2 <- c * d / ((c + d) ^ 2 * (c + d + 1))

  mu_diff <- u2 - u1
  sd_diff <- sqrt(var1 + var2)

  data_frame(posterior = pnorm(0, mu_diff, sd_diff),
             estimate = mu_diff,
             conf.low = qnorm(.025, mu_diff, sd_diff),
             conf.high = qnorm(.975, mu_diff, sd_diff))
}

credible_interval_approx(piazza$alpha1, piazza$beta1, aaron$alpha1, aaron$beta1)
## Source: local data frame [1 x 4]
## 
##   posterior estimate conf.low conf.high
##       (dbl)    (dbl)    (dbl)     (dbl)
## 1     0.606 -0.00182  -0.0151    0.0115
set.seed(2016)

intervals <- career_eb %>%
  filter(AB > 10) %>%
  sample_n(20) %>%
  group_by(name, H, AB) %>%
  do(credible_interval_approx(piazza$alpha1, piazza$beta1, .$alpha1, .$beta1)) %>%
  ungroup() %>%
  mutate(name = reorder(paste0(name, " (", H, " / ", AB, ")"), -estimate))
f <- function(H, AB) broom::tidy(prop.test(c(H, piazza$H), c(AB, piazza$AB)))
prop_tests <- purrr::map2_df(intervals$H, intervals$AB, f) %>%
  mutate(estimate = estimate1 - estimate2,
         name = intervals$name)

all_intervals <- bind_rows(
  mutate(intervals, type = "Credible"),
  mutate(prop_tests, type = "Confidence")
)

ggplot(all_intervals, aes(x = estimate, y = name, color = type)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  xlab("Piazza average - player average") +
  ylab("Player")

错误率控制

# 对比打算交易的球员与其他球员
career_eb_vs_piazza <- bind_cols(
  career_eb,
  credible_interval_approx(piazza$alpha1, piazza$beta1,
                           career_eb$alpha1, career_eb$beta1)) %>%
  select(name, posterior, conf.low, conf.high)

career_eb_vs_piazza
## Source: local data frame [9,342 x 4]
## 
##                    name posterior conf.low conf.high
##                   (chr)     (dbl)    (dbl)     (dbl)
## 1        Rogers Hornsby  2.84e-11   0.0345    0.0639
## 2          Ed Delahanty  7.10e-07   0.0218    0.0518
## 3  Shoeless Joe Jackson  8.77e-08   0.0278    0.0611
## 4         Willie Keeler  4.62e-06   0.0183    0.0472
## 5            Nap Lajoie  1.62e-05   0.0158    0.0441
## 6            Tony Gwynn  1.83e-05   0.0157    0.0442
## 7        Harry Heilmann  7.19e-06   0.0180    0.0476
## 8            Lou Gehrig  1.43e-05   0.0167    0.0461
## 9        Billy Hamilton  7.03e-06   0.0190    0.0502
## 10        Eddie Collins  2.00e-04   0.0113    0.0393
## ..                  ...       ...      ...       ...
# 计算q值
career_eb_vs_piazza <- career_eb_vs_piazza %>%
  arrange(posterior) %>%
  mutate(qvalue = cummean(posterior))

# 筛选那些q值小于0.05的
better <- career_eb_vs_piazza %>%
  filter(qvalue < .05)

better
## Source: local data frame [50 x 5]
## 
##                    name posterior conf.low conf.high   qvalue
##                   (chr)     (dbl)    (dbl)     (dbl)    (dbl)
## 1        Rogers Hornsby  2.84e-11   0.0345    0.0639 2.84e-11
## 2  Shoeless Joe Jackson  8.77e-08   0.0278    0.0611 4.39e-08
## 3          Ed Delahanty  7.10e-07   0.0218    0.0518 2.66e-07
## 4         Willie Keeler  4.62e-06   0.0183    0.0472 1.36e-06
## 5        Billy Hamilton  7.03e-06   0.0190    0.0502 2.49e-06
## 6        Harry Heilmann  7.19e-06   0.0180    0.0476 3.27e-06
## 7            Lou Gehrig  1.43e-05   0.0167    0.0461 4.85e-06
## 8            Nap Lajoie  1.62e-05   0.0158    0.0441 6.28e-06
## 9            Tony Gwynn  1.83e-05   0.0157    0.0442 7.62e-06
## 10           Bill Terry  3.03e-05   0.0162    0.0472 9.89e-06
## ..                  ...       ...      ...       ...      ...

影响因子

career %>%
  filter(AB >= 20) %>%
  ggplot(aes(AB, average)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_log10()

prior_mu <- alpha0 / (alpha0 + beta0)
career_eb %>%
  filter(AB >= 20) %>%
  gather(type, value, average, eb_estimate) %>%
  mutate(type = plyr::revalue(type, c(average = "Raw",
                                      eb_estimate = "With EB Shrinkage"))) %>%
  ggplot(aes(AB, value)) +
  geom_point() +
  scale_x_log10() +
  geom_hline(color = "red", lty = 2, size = 1.5, yintercept = prior_mu) +
  facet_wrap(~type) +
  ylab("average") +
    geom_smooth(method = "lm")

\[\mu_i = \mu_0 + \mu_{\mbox{AB}} \cdot \log(\mbox{AB})\]

\[\alpha_{0,i} = \mu_i / \sigma_0\]

\[\beta_{0,i} = (1 - \mu_i) / \sigma_0\]

\[p_i \sim \mbox{Beta}(\alpha_{0,i}, \beta_{0,i})\]

\[H_i \sim \mbox{Binom}(\mbox{AB}_i, p_i)\]

拟合模型

  • 寻找拟合后的模型参数,构建新的先验概率
library(gamlss)
# 拟合模型
fit <- gamlss(cbind(H, AB - H) ~ log(AB),
              data = career_eb,
              family = BB(mu.link = "identity"))
## GAMLSS-RS iteration 1: Global Deviance = 91083 
## GAMLSS-RS iteration 2: Global Deviance = 72051 
## GAMLSS-RS iteration 3: Global Deviance = 67972 
## GAMLSS-RS iteration 4: Global Deviance = 67966 
## GAMLSS-RS iteration 5: Global Deviance = 67966
library(broom)
# 展示拟合参数
td <- tidy(fit)
td
##   parameter        term estimate std.error statistic p.value
## 1        mu (Intercept)   0.1441  0.001616      89.1       0
## 2        mu     log(AB)   0.0151  0.000221      68.5       0
## 3     sigma (Intercept)  -6.3372  0.024910    -254.4       0
# 构建新的先验概率
mu_0 <- td$estimate[1]
mu_AB <- td$estimate[2]
sigma <- exp(td$estimate[3])

# 看看AB对先验概率的影响
crossing(x = seq(0.08, .35, .001), AB = c(1, 10, 100, 1000, 10000)) %>%
  mutate(density = dbeta(x, (mu_0 + mu_AB * log(AB)) / sigma,
                         (1 - (mu_0 + mu_AB * log(AB))) / sigma)) %>%
  mutate(AB = factor(AB)) %>%
  ggplot(aes(x, density, color = AB, group = AB)) +
  geom_line() +
  xlab("Batting average") +
  ylab("Prior density")

求后验概率

# 计算所有拟合值
mu <- fitted(fit, parameter = "mu")
sigma <- fitted(fit, parameter = "sigma")
# 计算所有后验概率
career_eb_wAB <- career_eb %>%
  dplyr::select(name, H, AB, original_eb = eb_estimate) %>%
  mutate(mu = mu,
         alpha0 = mu / sigma,
         beta0 = (1 - mu) / sigma,
         alpha1 = alpha0 + H,
         beta1 = beta0 + AB - H,
         new_eb = alpha1 / (alpha1 + beta1))
# 展示拟合后的击球率
ggplot(career_eb_wAB, aes(original_eb, new_eb, color = AB)) +
  geom_point() +
  geom_abline(color = "red") +
  xlab("Original EB Estimate") +
  ylab("EB Estimate w/ AB term") +
  scale_color_continuous(trans = "log", breaks = 10 ^ (0:4))

# 对比
library(tidyr)

lev <- c(raw = "Raw H / AB", original_eb = "EB Estimate", new_eb = "EB w/ Regression")

career_eb_wAB %>%
  filter(AB >= 10) %>%
  mutate(raw = H / AB) %>%
  gather(type, value, raw, original_eb, new_eb) %>%
  mutate(mu = ifelse(type == "original_eb", prior_mu,
                     ifelse(type == "new_eb", mu, NA))) %>%
  mutate(type = factor(plyr::revalue(type, lev), lev)) %>%
  ggplot(aes(AB, value)) +
  geom_point() +
  geom_line(aes(y = mu), color = "red") +
  scale_x_log10() +
  facet_wrap(~type) +
  xlab("At-Bats (AB)") +
  ylab("Estimate")

  • 矫正后我们的数据更复合现实了,其实这是贝叶斯分层模型的一个简单版本,通过考虑更多因素,我们可以构建更复杂的模型来挖掘出我们所需要的信息