binomial simulation for confidence intervals 95%

binomSim <- function(n, correct = 0) {
    pvals <-
    seq(.1, .9, by = .05) #true probabilty of getting a head array
    nosim <- 1000 #number of simulations
    
    #Calculate the coverage where true probability falls within CI
    coverage <- sapply(pvals, function(p) {
    phats <-
    (rbinom(nosim, prob = p, size = n) + correct) / (n + correct*2) #generate 1000 random binomial numbers, from value 0 to 20, and calculate p hat, which is the average of the 1000 observations
    ll <-
    phats - qnorm(.975) * sqrt(phats * (1 - phats) / n) #lower limit of CI, notice phats is used to calculate sd, instead of p, this is called Wald confidence interval, we can also use 1/2, which gives roughly phats - 1/sqrt(n)
    ul <-
    phats + qnorm(.975) * sqrt(phats * (1 - phats) / n) #upper limit of CI
    mean(p > ll & p < ul) #the probabilty of p falls within CI
    })
    
    ggplot(mapping = aes(x=pvals, y=coverage)) + geom_line() + geom_hline(yintercept = .95, color = "red", size = 2) + geom_hline(yintercept = .9, color = "red", size = 2)
}
binomSim(20)

As we can see from the plot, the coverage is not that great, with only when p = .5 when coverage is > 95%.

Now let’s try increase this sample size to 100.

binomSim(100)

Slightly better, but still not ideal.

Now let’s try fix this with Agresti/Coull interval, by adding 2 successes and 2 failures to the result

binomSim(20, 2)

What if we add 3 successes and failures

binomSim(20, 3)

Poison Interval

Calculation

Example: A nuclear pump failed 5 (X) times out of 94.32 days (t, or n), give 95% CI for the failure rate per day (lambda)?

Assume X ~ Poisson(lambda * t), the mean and variance of a Poisson distribution is both lamba, so CI is lambdaHat +/- qnorm(.975) x sqrt(lambdaHat/t)

x <- 5
t <- 94.32
lambaHat <- x/t

#Since number is big, we can assume CLT holds and that lamba mean ~ normal distribution
ci <- lambaHat + c(-1,1)*qnorm(.975)*sqrt(lambaHat/t)
ci
## [1] 0.006545667 0.099476386
#When number is small, we can use Chi square stats instead


#Now try with Poisson test, we are not really testing anything, so r: hypothesized rate and alternative hypothesis does not matter, we use default
poisson.test(5, 94.32, conf.level = .95)$conf.int
## [1] 0.01721254 0.12371005
## attr(,"conf.level")
## [1] 0.95

Simultaions for Poisson distribution confidence interval coverage

poissonSim <- function(t) {
    lambavals <- seq(0.005, 0.1, by = 0.01)
    nosim <- 1000
    
    coverage <- sapply(lambavals, function(lamba){
        lhats <- rpois(nosim, lambda = lamba*t)/t
        delta <- qnorm(.975) * sqrt(lhats/t)
        ll <- lhats - delta
        ul <- lhats + delta
        mean(lamba > ll & lamba < ul) #I used && before, and got incorrect result, & is vectorized, and && is not and only return 1 boolean val
    })
    
    ggplot(mapping = aes(x=lambavals, y=coverage)) + geom_line() + geom_hline(yintercept = .95, color = "red", size = 1.5) +  geom_hline(yintercept = .9, color = "black", size = 1.5) +
        ylim(0,1)

    }

Small sample size: t = 100

poissonSim(100)

Bigger sample size: t = 1000

poissonSim(1000)