Chapter 8.2

Question 5

library(tidyverse)
library(purrrfect)

N <- 10000
theta <- 3

(many_beta_CIs <- parameters(~n, c(5,10,20,40,80,160,320))
  %>% add_trials(N)
  %>% mutate(Ysample = map(n, .f = \(n) rbeta(n,theta,1)))
  %>% mutate(Ybar = map_dbl(Ysample, mean),
             Yn = map_dbl(Ysample, max),
             S2 = map_dbl(Ysample, var),
             SE = sqrt(S2) / sqrt(n))
  %>% mutate(UCL_exact = log(0.025)/(n*log(Yn)),
             LCL_exact = log(0.975)/(n*log(Yn)))
  %>% mutate(LCL_asymptotic = (Ybar - 1.96*SE) / (1 - (Ybar - 1.96*SE)),
             UCL_asymptotic = (Ybar + 1.96*SE) / (1 - (Ybar + 1.96*SE)))
  %>% mutate(exact_covers = ifelse(LCL_exact < theta & UCL_exact > theta, 1, 0),
             asymptotic_covers = ifelse(LCL_asymptotic < theta & UCL_asymptotic > theta, 1, 0))
)

# A tibble: 70,000 × 13
       n .trial Ysample    Ybar    Yn      S2     SE UCL_exact LCL_exact
   <dbl>  <dbl> <list>    <dbl> <dbl>   <dbl>  <dbl>     <dbl>     <dbl>
 1     5      1 <dbl [5]> 0.590 0.846 0.0585  0.108       4.42    0.0303
 2     5      2 <dbl [5]> 0.671 0.965 0.0420  0.0917     20.8     0.143 
 3     5      3 <dbl [5]> 0.821 0.998 0.0378  0.0869    348.      2.39  
 4     5      4 <dbl [5]> 0.683 0.973 0.0527  0.103      26.7     0.183 
 5     5      5 <dbl [5]> 0.665 0.835 0.0184  0.0606      4.09    0.0281
 6     5      6 <dbl [5]> 0.743 0.981 0.0347  0.0833     38.2     0.262 
 7     5      7 <dbl [5]> 0.891 0.952 0.00206 0.0203     14.9     0.102 
 8     5      8 <dbl [5]> 0.784 0.908 0.0276  0.0743      7.62    0.0523
 9     5      9 <dbl [5]> 0.766 0.997 0.0626  0.112     264.      1.81  
10     5     10 <dbl [5]> 0.826 0.996 0.0725  0.120     182.      1.25  
# ℹ 69,990 more rows
# ℹ 4 more variables: LCL_asymptotic <dbl>, UCL_asymptotic <dbl>,
#   exact_covers <dbl>, asymptotic_covers <dbl>

(betas_coverage <- many_beta_CIs
  %>% group_by(n)
  %>% summarize(exact_coverage = mean(exact_covers),
                asymptotic_coverage = mean(asymptotic_covers))
)

# A tibble: 7 × 3
      n exact_coverage asymptotic_coverage
  <dbl>          <dbl>               <dbl>
1     5          0.949               0.790
2    10          0.950               0.904
3    20          0.952               0.925
4    40          0.950               0.938
5    80          0.95                0.946
6   160          0.946               0.945
7   320          0.952               0.950

(ggplot(data = betas_coverage)
  + geom_line(aes(x = n, y = exact_coverage, color = 'exact'))
  + geom_line(aes(x = n, y = asymptotic_coverage, color = 'asymptotic'))
  + geom_hline(aes(yintercept = 0.95))
  + scale_color_discrete(name = '')
  + ylab('coverage')
  + theme_classic()
  + ggtitle('Simulated coverage of nominal 95% CIs for simulated betas')
  )

We see that the exact CI has a coverage rate of around 95% for every value of n. However, the asymptotic CI has too small of a coverage for small values of n and doesn’t meet the 95% threshold until roughly n = 100.

Question 6

library(purrrfect)
library(tidyverse)

N <- 10000

# UNIF(0,8)
(many_UNIF_CIs <- parameters(~n, c(5,20,50,100))
  %>% add_trials(N)
  %>% mutate(Ysample = map(n, .f = \(n) runif(n,0,8)))
  %>% mutate(Ybar = map_dbl(Ysample, mean),
             S2 = map_dbl(Ysample, var),
             SE = sqrt(S2/n))
  %>% mutate(LCL_mean = Ybar - qt(0.975, n-1) * SE,
             UCL_mean = Ybar + qt(0.975, n-1)*SE,
             LCL_var = ((n-1)*S2) / qchisq(0.975, n-1),
             UCL_var = ((n-1)*S2) / qchisq(0.025, n-1))
  %>% mutate(asymptotic_covers_mean = ifelse(LCL_mean < 4 & UCL_mean > 4, 1, 0),
             asymptotic_covers_var = ifelse(LCL_var < 16/3 & UCL_var > 16/3, 1, 0))
)

# A tibble: 40,000 × 12
       n .trial Ysample    Ybar    S2    SE LCL_mean UCL_mean LCL_var UCL_var
   <dbl>  <dbl> <list>    <dbl> <dbl> <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
 1     5      1 <dbl [5]>  4.06 4.92  0.992    1.31      6.81   1.76    40.6 
 2     5      2 <dbl [5]>  2.55 5.66  1.06    -0.400     5.51   2.03    46.7 
 3     5      3 <dbl [5]>  3.24 8.70  1.32    -0.421     6.90   3.12    71.8 
 4     5      4 <dbl [5]>  1.14 0.562 0.335    0.210     2.07   0.202    4.64
 5     5      5 <dbl [5]>  2.55 5.78  1.08    -0.438     5.53   2.08    47.8 
 6     5      6 <dbl [5]>  3.37 5.78  1.08     0.389     6.36   2.07    47.7 
 7     5      7 <dbl [5]>  6.13 5.30  1.03     3.27      8.99   1.90    43.8 
 8     5      8 <dbl [5]>  3.58 7.25  1.20     0.239     6.93   2.60    59.9 
 9     5      9 <dbl [5]>  3.66 3.55  0.843    1.32      6.00   1.27    29.3 
10     5     10 <dbl [5]>  3.56 3.21  0.801    1.34      5.79   1.15    26.5 
# ℹ 39,990 more rows
# ℹ 2 more variables: asymptotic_covers_mean <dbl>, asymptotic_covers_var <dbl>

(UNIFs_coverage <- many_UNIF_CIs
  %>% group_by(n)
  %>% summarize(asymptotic_coverage_mean = mean(asymptotic_covers_mean),
                asymptotic_coverage_var = mean(asymptotic_covers_var))
)

# A tibble: 4 × 3
      n asymptotic_coverage_mean asymptotic_coverage_var
  <dbl>                    <dbl>                   <dbl>
1     5                    0.936                   0.986
2    20                    0.950                   0.997
3    50                    0.944                   0.997
4   100                    0.948                   0.998

## EXP(B = 4)
(many_EXP_CIs <- parameters(~n, c(5,20,50,100))
  %>% add_trials(N)
  %>% mutate(Ysample = map(n, .f = \(n) rexp(n, rate = 1/4)))
  %>% mutate(Ybar = map_dbl(Ysample, mean),
             S2 = map_dbl(Ysample, var),
             SE = sqrt(S2/n))
  %>% mutate(LCL_mean = Ybar - qt(0.975, n-1) * SE,
             UCL_mean = Ybar + qt(0.975, n-1)*SE,
             LCL_var = ((n-1)*S2) / qchisq(0.975, n-1),
             UCL_var = ((n-1)*S2) / qchisq(0.025, n-1))
  %>% mutate(asymptotic_covers_mean = ifelse(LCL_mean < 4 & UCL_mean > 4, 1, 0),
             asymptotic_covers_var = ifelse(LCL_var < 16 & UCL_var > 16, 1, 0))
)

# A tibble: 40,000 × 12
       n .trial Ysample    Ybar     S2    SE LCL_mean UCL_mean LCL_var UCL_var
   <dbl>  <dbl> <list>    <dbl>  <dbl> <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
 1     5      1 <dbl [5]> 0.514  0.177 0.188 -0.00819     1.04  0.0634    1.46
 2     5      2 <dbl [5]> 3.03   2.41  0.694  1.10        4.96  0.865    19.9 
 3     5      3 <dbl [5]> 1.68   3.12  0.790 -0.516       3.87  1.12     25.8 
 4     5      4 <dbl [5]> 3.95  21.1   2.05  -1.75        9.66  7.57    174.  
 5     5      5 <dbl [5]> 4.71  20.3   2.02  -0.887      10.3   7.29    168.  
 6     5      6 <dbl [5]> 2.11   0.803 0.401  0.993       3.22  0.288     6.63
 7     5      7 <dbl [5]> 3.05   8.27  1.29  -0.525       6.62  2.97     68.3 
 8     5      8 <dbl [5]> 4.53   5.95  1.09   1.50        7.56  2.14     49.2 
 9     5      9 <dbl [5]> 3.69  14.5   1.71  -1.04        8.43  5.22    120.  
10     5     10 <dbl [5]> 4.30  46.1   3.04  -4.13       12.7  16.6     381.  
# ℹ 39,990 more rows
# ℹ 2 more variables: asymptotic_covers_mean <dbl>, asymptotic_covers_var <dbl>

(EXPs_coverage <- many_EXP_CIs
  %>% group_by(n)
  %>% summarize(asymptotic_coverage_mean = mean(asymptotic_covers_mean),
                asymptotic_coverage_var = mean(asymptotic_covers_var))
  )

# A tibble: 4 × 3
      n asymptotic_coverage_mean asymptotic_coverage_var
  <dbl>                    <dbl>                   <dbl>
1     5                    0.877                   0.821
2    20                    0.918                   0.727
3    50                    0.935                   0.697
4   100                    0.945                   0.696

## GEO(p = 0.2)
(many_GEO_CIs <- parameters(~n, c(5,20,50,100))
  %>% add_trials(N)
  %>% mutate(Ysample = map(n, .f = \(n) rgeom(n, 0.2)))
  %>% mutate(Ybar = map_dbl(Ysample, mean),
             S2 = map_dbl(Ysample, var),
             SE = sqrt(S2/n))
  %>% mutate(LCL_mean = Ybar - qt(0.975, n-1) * SE,
             UCL_mean = Ybar + qt(0.975, n-1)*SE,
             LCL_var = ((n-1)*S2) / qchisq(0.975, n-1),
             UCL_var = ((n-1)*S2) / qchisq(0.025, n-1))
  %>%  mutate(asymptotic_covers_mean = ifelse(LCL_mean < 4 & UCL_mean > 4, 1, 0),
           asymptotic_covers_var = ifelse(LCL_var < (1-0.2) / 0.2^2 & UCL_var > (1-0.2) / 0.2^2, 1, 0))
)

# A tibble: 40,000 × 12
       n .trial Ysample    Ybar    S2    SE LCL_mean UCL_mean LCL_var UCL_var
   <dbl>  <dbl> <list>    <dbl> <dbl> <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
 1     5      1 <int [5]>   5.6  25.3 2.25    -0.645    11.8    9.08    209. 
 2     5      2 <int [5]>   6.4  66.3 3.64    -3.71     16.5   23.8     547. 
 3     5      3 <int [5]>   3.2  16.7 1.83    -1.87      8.27   5.99    138. 
 4     5      4 <int [5]>   5.6  43.8 2.96    -2.62     13.8   15.7     362. 
 5     5      5 <int [5]>   1.8   1.7 0.583    0.181     3.42   0.610    14.0
 6     5      6 <int [5]>   3    12.5 1.58    -1.39      7.39   4.49    103. 
 7     5      7 <int [5]>   3.4  11.8 1.54    -0.865     7.67   4.24     97.4
 8     5      8 <int [5]>   2.6  22.8 2.14    -3.33      8.53   8.18    188. 
 9     5      9 <int [5]>   3.2  19.2 1.96    -2.24      8.64   6.89    159. 
10     5     10 <int [5]>   7.2  55.7 3.34    -2.07     16.5   20.0     460. 
# ℹ 39,990 more rows
# ℹ 2 more variables: asymptotic_covers_mean <dbl>, asymptotic_covers_var <dbl>

(GEOs_coverage <- many_GEO_CIs
  %>% group_by(n)
  %>% summarize(asymptotic_coverage_mean = mean(asymptotic_covers_mean),
                asymptotic_coverage_var = mean(asymptotic_covers_var))
  )

# A tibble: 4 × 3
      n asymptotic_coverage_mean asymptotic_coverage_var
  <dbl>                    <dbl>                   <dbl>
1     5                    0.881                   0.825
2    20                    0.918                   0.731
3    50                    0.937                   0.699
4   100                    0.944                   0.692

(UNIFs_coverage <- UNIFs_coverage %>% mutate(distribution = 'UNIF(0,8)'))

# A tibble: 4 × 4
      n asymptotic_coverage_mean asymptotic_coverage_var distribution
  <dbl>                    <dbl>                   <dbl> <chr>       
1     5                    0.936                   0.986 UNIF(0,8)   
2    20                    0.950                   0.997 UNIF(0,8)   
3    50                    0.944                   0.997 UNIF(0,8)   
4   100                    0.948                   0.998 UNIF(0,8)

(EXPs_coverage <- EXPs_coverage %>% mutate(distribution = 'EXP(B = 4)'))

# A tibble: 4 × 4
      n asymptotic_coverage_mean asymptotic_coverage_var distribution
  <dbl>                    <dbl>                   <dbl> <chr>       
1     5                    0.877                   0.821 EXP(B = 4)  
2    20                    0.918                   0.727 EXP(B = 4)  
3    50                    0.935                   0.697 EXP(B = 4)  
4   100                    0.945                   0.696 EXP(B = 4)

(GEOs_coverage <- GEOs_coverage %>% mutate(distribution = 'GEO(p = 0.2)'))

# A tibble: 4 × 4
      n asymptotic_coverage_mean asymptotic_coverage_var distribution
  <dbl>                    <dbl>                   <dbl> <chr>       
1     5                    0.881                   0.825 GEO(p = 0.2)
2    20                    0.918                   0.731 GEO(p = 0.2)
3    50                    0.937                   0.699 GEO(p = 0.2)
4   100                    0.944                   0.692 GEO(p = 0.2)

(mu_table <- bind_rows(
  UNIFs_coverage
    %>% select(n, asymptotic_coverage_mean)
    %>% rename('Y_i ~ UNIF(0,8)' = asymptotic_coverage_mean),
  
  EXPs_coverage
    %>% select(n, asymptotic_coverage_mean)
    %>% rename('Y_i ~ EXP(B = 4)' = asymptotic_coverage_mean),
  
  GEOs_coverage
    %>% select(n, asymptotic_coverage_mean)
    %>% rename('Y_i ~ GEO(p = 0.2)' = asymptotic_coverage_mean),
)
  %>% arrange(n)
  %>% mutate(across(-n, ~round(.,3)))
)

# A tibble: 12 × 4
       n `Y_i ~ UNIF(0,8)` `Y_i ~ EXP(B = 4)` `Y_i ~ GEO(p = 0.2)`
   <dbl>             <dbl>              <dbl>                <dbl>
 1     5             0.936             NA                   NA    
 2     5            NA                  0.877               NA    
 3     5            NA                 NA                    0.881
 4    20             0.95              NA                   NA    
 5    20            NA                  0.918               NA    
 6    20            NA                 NA                    0.918
 7    50             0.944             NA                   NA    
 8    50            NA                  0.935               NA    
 9    50            NA                 NA                    0.937
10   100             0.948             NA                   NA    
11   100            NA                  0.945               NA    
12   100            NA                 NA                    0.944

library(knitr)
kable(mu_table)

n	Y_i ~ UNIF(0,8)	Y_i ~ EXP(B = 4)	Y_i ~ GEO(p = 0.2)
5	0.936	NA	NA
5	NA	0.877	NA
5	NA	NA	0.881
20	0.950	NA	NA
20	NA	0.918	NA
20	NA	NA	0.918
50	0.944	NA	NA
50	NA	0.935	NA
50	NA	NA	0.937
100	0.948	NA	NA
100	NA	0.945	NA
100	NA	NA	0.944

(variance_table <- bind_rows(
  UNIFs_coverage
    %>% select(n, asymptotic_coverage_var)
    %>% rename('Y_i ~ UNIF(0,8)' = asymptotic_coverage_var),
  
  EXPs_coverage
    %>% select(n, asymptotic_coverage_var)
    %>% rename('Y_i ~ EXP(B = 4)' = asymptotic_coverage_var),
  
  GEOs_coverage
    %>% select(n, asymptotic_coverage_var)
    %>% rename('Y_i ~ GEO(p = 0.2)' = asymptotic_coverage_var),
)
  %>% arrange(n)
  %>% mutate(across(-n, ~round(.,3)))
)

# A tibble: 12 × 4
       n `Y_i ~ UNIF(0,8)` `Y_i ~ EXP(B = 4)` `Y_i ~ GEO(p = 0.2)`
   <dbl>             <dbl>              <dbl>                <dbl>
 1     5             0.986             NA                   NA    
 2     5            NA                  0.821               NA    
 3     5            NA                 NA                    0.825
 4    20             0.997             NA                   NA    
 5    20            NA                  0.727               NA    
 6    20            NA                 NA                    0.731
 7    50             0.997             NA                   NA    
 8    50            NA                  0.697               NA    
 9    50            NA                 NA                    0.699
10   100             0.998             NA                   NA    
11   100            NA                  0.696               NA    
12   100            NA                 NA                    0.692

library(knitr)
kable(variance_table)

n	Y_i ~ UNIF(0,8)	Y_i ~ EXP(B = 4)	Y_i ~ GEO(p = 0.2)
5	0.986	NA	NA
5	NA	0.821	NA
5	NA	NA	0.825
20	0.997	NA	NA
20	NA	0.727	NA
20	NA	NA	0.731
50	0.997	NA	NA
50	NA	0.697	NA
50	NA	NA	0.699
100	0.998	NA	NA
100	NA	0.696	NA
100	NA	NA	0.692

When looking specifically at our means table, n = 5 is problematic for both the EXP and GEO distribution. When switching gears to the variance table, we see extreme under-coverage as n increases for both EXP and GEO, with n = 100 having concerningly low variance. For both the mean and variance CIs, the consequences are the least severe for the UNIF distribution. Lastly, we see that the variance CIs are most affected by the consequences of the normality assumption.