PS_8_2

Question 5 : for n = { 5, 10, 20, 40, 80, 160, 320}

library(tidyverse)

Warning: package 'tidyverse' was built under R version 4.5.2

Warning: package 'ggplot2' was built under R version 4.5.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(purrrfect)


Attaching package: 'purrrfect'

The following objects are masked from 'package:base':

    replicate, tabulate

(sim5 <- parameters(~theta, ~n, 2, c(5, 10, 20, 40, 80, 160, 320)) %>% 
  add_trials(10000) %>% 
  mutate(
    y_sample = pmap(list(theta, n), .f = \(theta, n) rbeta(n, theta, 1)),
    Ybar = map_dbl(y_sample, mean),
    S2   = map_dbl(y_sample, var),
    Yn   = map_dbl(y_sample, max),
    LCL_exact = log(0.975) / (n * log(Yn)),
    UCL_exact = log(0.025) / (n * log(Yn)),
    mu_L = Ybar - 1.96 * sqrt(S2 / n),
    mu_U = Ybar + 1.96 * sqrt(S2 / n),
    LCL_asymptotic = mu_L / (1 - mu_L),
    UCL_asymptotic = mu_U / (1 - mu_U),
    exact_covers = (LCL_exact <= theta) & (UCL_exact >= theta),
    asymp_covers = (LCL_asymptotic <= theta) & (UCL_asymptotic >= theta)
  )
)

# A tibble: 70,000 × 15
   theta     n .trial y_sample   Ybar     S2    Yn LCL_exact UCL_exact  mu_L
   <dbl> <dbl>  <dbl> <list>    <dbl>  <dbl> <dbl>     <dbl>     <dbl> <dbl>
 1     2     5      1 <dbl [5]> 0.661 0.0565 0.927    0.0670      9.76 0.453
 2     2     5      2 <dbl [5]> 0.588 0.146  0.960    0.125      18.3  0.253
 3     2     5      3 <dbl [5]> 0.482 0.0939 0.940    0.0819     11.9  0.214
 4     2     5      4 <dbl [5]> 0.647 0.0897 0.999    3.58      522.   0.384
 5     2     5      5 <dbl [5]> 0.797 0.0125 0.921    0.0618      9.00 0.699
 6     2     5      6 <dbl [5]> 0.557 0.0454 0.843    0.0296      4.31 0.370
 7     2     5      7 <dbl [5]> 0.568 0.0461 0.940    0.0813     11.8  0.380
 8     2     5      8 <dbl [5]> 0.635 0.0410 0.980    0.250      36.4  0.458
 9     2     5      9 <dbl [5]> 0.600 0.0576 0.859    0.0333      4.85 0.390
10     2     5     10 <dbl [5]> 0.782 0.0312 0.970    0.165      24.1  0.627
# ℹ 69,990 more rows
# ℹ 5 more variables: mu_U <dbl>, LCL_asymptotic <dbl>, UCL_asymptotic <dbl>,
#   exact_covers <lgl>, asymp_covers <lgl>

sim5_summary <- sim5 %>%
  group_by(n) %>%
  summarize(
    exact_coverage = mean(exact_covers, na.rm = TRUE),
    asymp_coverage = mean(asymp_covers, na.rm = TRUE)
  )

print(sim5_summary)

# A tibble: 7 × 3
      n exact_coverage asymp_coverage
  <dbl>          <dbl>          <dbl>
1     5          0.950          0.822
2    10          0.950          0.907
3    20          0.951          0.930
4    40          0.952          0.940
5    80          0.951          0.948
6   160          0.949          0.952
7   320          0.950          0.947

ggplot(sim5_summary, aes(x = n)) +
  geom_line(aes(y = exact_coverage, color = "Exact")) +
  geom_line(aes(y = asymp_coverage, color = "Asymptotic")) +
  geom_hline(yintercept = 0.95, linetype = "dashed", color = "black") +
  scale_x_continuous(trans = 'log10', breaks = c(5, 10, 20, 40, 80, 160, 320)) +
  labs(
    title = "Exact vs Asymptotic Coverage",
    x = "n",
    y = "Simulated Coverage Proportion",
    color = "Interval Type"
  ) +
  theme_classic()

Question 6

library(knitr)

true_params <- list(
  mu  = c(UNIF = 4, EXP = 4, GEO = 4),
  var = c(UNIF = 64/12, EXP = 16, GEO = 20)
)

(sim6 <- parameters(~n, c(5, 20, 50, 100)) %>%
  add_trials(10000) %>%
  mutate(
    samp_unif = map(n, ~runif(.x, min = 0, max = 8)),
    samp_exp  = map(n, ~rexp(.x, rate = 1/4)),
    samp_geo  = map(n, ~rgeom(.x, prob = 0.2)),
    
    ybar_unif = map_dbl(samp_unif, mean),
    ybar_exp  = map_dbl(samp_exp, mean),
    ybar_geo  = map_dbl(samp_geo, mean),
    
    s2_unif = map_dbl(samp_unif, var),
    s2_exp  = map_dbl(samp_exp, var),
    s2_geo  = map_dbl(samp_geo, var),
    
    t_crit = qt(0.975, df = n - 1),
    
    mu_lcl_unif = ybar_unif - t_crit * sqrt(s2_unif / n),
    mu_ucl_unif = ybar_unif + t_crit * sqrt(s2_unif / n),
    
    mu_lcl_exp = ybar_exp - t_crit * sqrt(s2_exp / n),
    mu_ucl_exp = ybar_exp + t_crit * sqrt(s2_exp / n),
    
    mu_lcl_geo = ybar_geo - t_crit * sqrt(s2_geo / n),
    mu_ucl_geo = ybar_geo + t_crit * sqrt(s2_geo / n),
    
    cov_mu_unif = (mu_lcl_unif <= true_params$mu["UNIF"]) & (mu_ucl_unif >= true_params$mu["UNIF"]),
    cov_mu_exp  = (mu_lcl_exp <= true_params$mu["EXP"]) & (mu_ucl_exp >= true_params$mu["EXP"]),
    cov_mu_geo  = (mu_lcl_geo <= true_params$mu["GEO"]) & (mu_ucl_geo >= true_params$mu["GEO"]),
    
    chi_lower = qchisq(0.025, df = n - 1),
    chi_upper = qchisq(0.975, df = n - 1),
    
    var_lcl_unif = (n - 1) * s2_unif / chi_upper,
    var_ucl_unif = (n - 1) * s2_unif / chi_lower,
    
    var_lcl_exp = (n - 1) * s2_exp / chi_upper,
    var_ucl_exp = (n - 1) * s2_exp / chi_lower,
    
    var_lcl_geo = (n - 1) * s2_geo / chi_upper,
    var_ucl_geo = (n - 1) * s2_geo / chi_lower,
    
    cov_var_unif = (var_lcl_unif <= true_params$var["UNIF"]) & (var_ucl_unif >= true_params$var["UNIF"]),
    cov_var_exp  = (var_lcl_exp <= true_params$var["EXP"]) & (var_ucl_exp >= true_params$var["EXP"]),
    cov_var_geo  = (var_lcl_geo <= true_params$var["GEO"]) & (var_ucl_geo >= true_params$var["GEO"])
  )
)

# A tibble: 40,000 × 32
       n .trial samp_unif samp_exp  samp_geo ybar_unif ybar_exp ybar_geo s2_unif
   <dbl>  <dbl> <list>    <list>    <list>       <dbl>    <dbl>    <dbl>   <dbl>
 1     5      1 <dbl [5]> <dbl [5]> <int>         3.64     5.64      1.2    7.92
 2     5      2 <dbl [5]> <dbl [5]> <int>         2.70     3.09      3.8    2.35
 3     5      3 <dbl [5]> <dbl [5]> <int>         4.02     4.29      4     10.4 
 4     5      4 <dbl [5]> <dbl [5]> <int>         5.65     2.44      2      2.28
 5     5      5 <dbl [5]> <dbl [5]> <int>         1.46     2.59      4.2    4.30
 6     5      6 <dbl [5]> <dbl [5]> <int>         5.06     6.22      8      6.11
 7     5      7 <dbl [5]> <dbl [5]> <int>         3.81     2.50      3.2    9.11
 8     5      8 <dbl [5]> <dbl [5]> <int>         4.12     3.13      4      5.51
 9     5      9 <dbl [5]> <dbl [5]> <int>         3.51     2.30      2.4    5.53
10     5     10 <dbl [5]> <dbl [5]> <int>         4.54     2.28      6.4    4.22
# ℹ 39,990 more rows
# ℹ 23 more variables: s2_exp <dbl>, s2_geo <dbl>, t_crit <dbl>,
#   mu_lcl_unif <dbl>, mu_ucl_unif <dbl>, mu_lcl_exp <dbl>, mu_ucl_exp <dbl>,
#   mu_lcl_geo <dbl>, mu_ucl_geo <dbl>, cov_mu_unif <lgl>, cov_mu_exp <lgl>,
#   cov_mu_geo <lgl>, chi_lower <dbl>, chi_upper <dbl>, var_lcl_unif <dbl>,
#   var_ucl_unif <dbl>, var_lcl_exp <dbl>, var_ucl_exp <dbl>,
#   var_lcl_geo <dbl>, var_ucl_geo <dbl>, cov_var_unif <lgl>, …

table_mu <- sim6 %>%
  group_by(n) %>%
  summarize(
    `Y ~ UNIF(0,8)` = mean(cov_mu_unif),
    `Y ~ EXP(beta = 4)` = mean(cov_mu_exp),
    `Y ~ GEO(p = 0.2)` = mean(cov_mu_geo)
  ) %>%
  mutate(n = paste0("n = ", n)) %>%
  bind_rows(
    tibble(n = "True mu", `Y ~ UNIF(0,8)` = 4, `Y ~ EXP(beta = 4)` = 4, `Y ~ GEO(p = 0.2)` = 4),
    .
  )

table_var <- sim6 %>%
  group_by(n) %>%
  summarize(
    `Y ~ UNIF(0,8)` = mean(cov_var_unif),
    `Y ~ EXP(beta = 4)` = mean(cov_var_exp),
    `Y ~ GEO(p = 0.2)` = mean(cov_var_geo)
  ) %>%
  mutate(n = paste0("n = ", n)) %>%
  bind_rows(
    tibble(n = "True sigma^2", `Y ~ UNIF(0,8)` = round(64/12, 2), `Y ~ EXP(beta = 4)` = 16, `Y ~ GEO(p = 0.2)` = 20),
    .
  )

kable(table_mu, align = "c", caption = "Coverage of mu")

Coverage of mu
n	Y ~ UNIF(0,8)	Y ~ EXP(beta = 4)	Y ~ GEO(p = 0.2)
True mu	4.0000	4.0000	4.0000
n = 5	0.9355	0.8823	0.8823
n = 20	0.9469	0.9210	0.9191
n = 50	0.9449	0.9362	0.9363
n = 100	0.9510	0.9481	0.9419

kable(table_var, align = "c", caption = "Coverage of sigma^2")

Coverage of sigma^2
n	Y ~ UNIF(0,8)	Y ~ EXP(beta = 4)	Y ~ GEO(p = 0.2)
True sigma^2	5.3300	16.0000	20.0000
n = 5	0.9837	0.8257	0.8216
n = 20	0.9957	0.7330	0.7261
n = 50	0.9965	0.7074	0.7035
n = 100	0.9970	0.7010	0.6875

The first thing I notice looking at these tables is that as n increases coverage of sigma^2 actually decreases for the EXP and GEO distributions which is interesting. I also notice that EXP and GEO distributions do poorly for small values of n (for the mean), but as increases it gets closer to 95%, CLT showing up again. The UNIF distribution did pretty decently for n=5 with a 93% coverage, overall coverage stayed pretty much the same for UNIF distribution, I think probably because it’s not a skewed distribution like the GEO and EXP are.