library(ggplot2)
library(reshape)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:reshape':
##
## rename
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
theme_set(theme_bw())
set.seed(42)
Repeatedly flip a biased coin 100 times and estimate its bias. Adapted from Yakir 11.2.3.
estimate_coin_bias <- function(n, p) {
mean(rbinom(n,1,p))
}
n <- 100
p <- 0.3
p_hat <- replicate(1e5, estimate_coin_bias(n, p))
# plot the sampling distribution
qplot(x=p_hat, geom="histogram", binwidth=0.01) +
geom_vline(xintercept=p) +
geom_vline(xintercept=mean(p_hat), linetype=2, color="red")
# repeat this for different sample sizes
plot_data <- data.frame()
for (n in c(100, 200, 400, 800)) {
tmp <- data.frame(n=n, p_hat=replicate(1e5, estimate_coin_bias(n, p)))
plot_data <- rbind(plot_data, tmp)
}
qplot(data=plot_data, x=p_hat, geom="histogram", binwidth=0.01, facets = . ~ n)
se <- plot_data %>%
group_by(n) %>%
summarize(se=sd(p_hat))
qplot(data=se, x=n, y=se) +
stat_function(fun=function(n) {sqrt(p * (1 - p) / n)}, linetype=2)
set.seed(42)
n <- 100
p <- 0.3
p_hat <- replicate(1e5, estimate_coin_bias(n, p))
# compute upper and lower confidence intervals
LCL <- p_hat - 1.96*sqrt(p_hat*(1-p_hat)/n)
UCL <- p_hat + 1.96*sqrt(p_hat*(1-p_hat)/n)
# check how often the true proportion is contained in the estimated confidence interval
mean(p >= LCL & p <= UCL)
## [1] 0.94983
# plot 100 confidence intervals and the true value
plot_data <- data.frame(p_hat, LCL, UCL)[1:100,]
plot_data <- transform(plot_data, contains_p=(p >= LCL & p <= UCL))
ggplot(data=plot_data, aes(x=1:nrow(plot_data), y=p_hat, color=contains_p)) +
geom_pointrange(aes(ymin=LCL, ymax=UCL)) +
geom_hline(yintercept=p, linetype=2) +
xlab('') +
scale_color_manual(values=c("red","darkgreen")) +
theme(legend.position="none")
# construct a null distribution: what would happen if the coin were fair?
n <- 100
p0_hat <- replicate(1e5, estimate_coin_bias(n, p=0.5))
# now conduct one experiment with 100 rolls from a biased coin
p_hat <- estimate_coin_bias(n, p=0.3)
# plot the null distribution and see where the observed estimate lies in it
qplot(x=p0_hat, geom="histogram", binwidth=0.01) +
geom_vline(xintercept=p_hat, linetype=2, color="red")
## Warning in loop_apply(n, do.ply): position_stack requires constant width:
## output may be incorrect
# compare this to our experiment
# how likely is it that we would see an estimate this extreme if the coin really were fair?
num_as_extreme <- sum(p0_hat <= p_hat)
p_value <- num_as_extreme / length(p0_hat)
Only 0 out of 100000 estimates from a fair coin with p=0.5 would result in an estimate of p_hat=0.26 or smaller, corresponding to a p-value of 0.