PS 9.4

S1:

library(tidyverse)

Warning: package 'tidyverse' was built under R version 4.5.2

Warning: package 'ggplot2' was built under R version 4.5.2

Warning: package 'readr' was built under R version 4.5.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.2.0
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(purrrfect)


Attaching package: 'purrrfect'

The following objects are masked from 'package:base':

    replicate, tabulate

library(knitr)

t_val <- 5

sim_data <- parameters(~n, ~beta,
                       c(20, 40, 80, 160, 320, 640), 
                       c(0.5, 2, 4)) %>%
  add_trials(10000) %>%
  mutate(
    Ysample = pmap(list(n, beta), \(n, b) rexp(n, rate = 1/b)),
    ybar = map_dbl(Ysample, mean),
    true_theta = exp(-t_val / beta),
    mle = exp(-t_val / ybar),
    se = (t_val * exp(-t_val / ybar)) / (ybar * sqrt(n)),
    lcl = mle - 1.96 * se,
    ucl = mle + 1.96 * se,
    covers = ifelse(lcl < true_theta & ucl > true_theta, 1, 0)
  )

coverage_summary <- sim_data %>%
  summarize(
    coverage = mean(covers),
    .by = c(n, beta)
  )

kable(coverage_summary)

n	beta	coverage
20	0.5	0.7468
20	2.0	0.8921
20	4.0	0.9256
40	0.5	0.8015
40	2.0	0.9168
40	4.0	0.9358
80	0.5	0.8402
80	2.0	0.9262
80	4.0	0.9401
160	0.5	0.8762
160	2.0	0.9398
160	4.0	0.9469
320	0.5	0.9029
320	2.0	0.9472
320	4.0	0.9499
640	0.5	0.9211
640	2.0	0.9515
640	4.0	0.9503

ggplot(data = coverage_summary) +
  geom_line(aes(x = n, y = coverage)) +
  geom_hline(aes(yintercept = 0.95), linetype = 2) +
  theme_classic(base_size = 14) +
  labs(y = 'Coverage', x = 'n') +
  facet_wrap(~beta, labeller = label_both)

A higher beta value seems to converge to the 95% coverage sooner.

S2:

loglik.alpha <- \(alpha, yvals) {
  sum(dgamma(yvals, shape = alpha, scale = mean(yvals) / alpha, log = TRUE))
}

get.mle <- \(yvals) {
  opt <- optimize(f = loglik.alpha, interval = c(0.01, 100), yvals = yvals, maximum = TRUE)
  c(alpha_MLE = opt$maximum, beta_MLE = mean(yvals) / opt$maximum)
}

my.y <- c(6.690889, 1.989313, 4.884504, 2.142505, 4.177150)
get.mle(my.y)

alpha_MLE  beta_MLE 
4.8222733 0.8246883

sim_data2 <- parameters(~n, ~alpha, ~beta,
                        c(5, 20, 50, 100, 150, 200),
                        4,
                        2) %>%
  add_trials(10000) %>%
  mutate(
    Ysample = pmap(list(n, alpha, beta), \(n, a, b) rgamma(n, shape = a, scale = b)),
    mles = map(Ysample, get.mle),
    alpha_mle = map_dbl(mles, 1),
    beta_mle = map_dbl(mles, 2)
  )

ggplot(sim_data2, aes(x = alpha_mle)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black") +
  geom_vline(xintercept = 4, color = "red", linewidth = 1) +
  facet_wrap(~n, labeller = label_both) +
  coord_cartesian(xlim = c(0, 15)) +
  theme_classic(base_size = 14) +
  labs(x = expression(hat(alpha)[MLE]), y = "Count")

ggplot(sim_data2, aes(x = beta_mle)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black") +
  geom_vline(xintercept = 2, color = "red", linewidth = 1) +
  facet_wrap(~n, labeller = label_both) +
  coord_cartesian(xlim = c(0, 5)) +
  theme_classic(base_size = 14) +
  labs(x = expression(hat(beta)[MLE]), y = "Count")

As n increases the histograms (our MLE estimators) appear to normalize around Beta = 2 and Alpha = 4

S3:

sim_data3 <- parameters(~n, ~p,
                        c(10, 20, 40, 80, 160, 320, 640, 1280),
                        c(0.1, 0.2, 0.3, 0.4, 0.5)) %>%
  add_trials(10000) %>%
  mutate(
    p_hat = pmap_dbl(list(n, p), \(n, prob) rbinom(1, n, prob) / n),
    tau_hat = p_hat * (1 - p_hat),
    true_tau = p * (1 - p),
    est_crlb = (p_hat * (1 - p_hat) * (1 - 2 * p_hat)^2) / n,
    se = sqrt(est_crlb),
    lcl = tau_hat - 1.96 * se,
    ucl = tau_hat + 1.96 * se,
    covers = ifelse(lcl <= true_tau & ucl >= true_tau, 1, 0)
  )

ggplot(sim_data3, aes(x = tau_hat)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  facet_grid(p ~ n, scales = "free", labeller = label_both) +
  theme_classic(base_size = 14) +
  labs(x = expression(hat(tau)(p)), y = "Density")

coverage_data <- sim_data3 %>%
  summarize(
    coverage = mean(covers, na.rm = TRUE),
    .by = c(n, p)
  )

ggplot(coverage_data, aes(x = n, y = coverage)) +
  geom_line() +
  geom_hline(yintercept = 0.95, linetype = "dashed") +
  facet_wrap(~p, labeller = label_both) +
  theme_classic(base_size = 14) +
  labs(x = "n", y = "Coverage")

The closer p is to 0.5, the normal curve takes longer to form.
At exactly p = 0.5 it never becomes normal. It hits a ceiling and gets stuck skewed to the side, no matter how large the sample is.

Because the distribution is jammed against that ceiling, the standard error math breaks down and makes the confidence interval way too wide.

This extra-wide interval overshoots the top boundary, meaning it catches the true answer 100% of the time instead of the target 95%.

Since p is a probability between 0 and 1, the formula tau(p) = p(1-p) mathematically tops out at 0.25.
when p=.5, it’s right at that edge.