Chapter 8.2

Question 5

library(tidyverse)
library(purrrfect)

N <- 10000
theta <- 3

(many_beta_CIs <- parameters(~n, c(5,10,20,40,80,160,320))
  %>% add_trials(N)
  %>% mutate(Ysample = map(n, .f = \(n) rbeta(n,theta,1)))
  %>% mutate(Ybar = map_dbl(Ysample, mean),
             Yn = map_dbl(Ysample, max),
             S2 = map_dbl(Ysample, var),
             SE = sqrt(S2) / sqrt(n))
  %>% mutate(UCL_exact = log(0.025)/(n*log(Yn)),
             LCL_exact = log(0.975)/(n*log(Yn)))
  %>% mutate(LCL_asymptotic = (Ybar - 1.96*SE) / (1 - (Ybar - 1.96*SE)),
             UCL_asymptotic = (Ybar + 1.96*SE) / (1 - (Ybar + 1.96*SE)))
  %>% mutate(exact_covers = ifelse(LCL_exact < theta & UCL_exact > theta, 1, 0),
             asymptotic_covers = ifelse(LCL_asymptotic < theta & UCL_asymptotic > theta, 1, 0))
)
# A tibble: 70,000 × 13
       n .trial Ysample    Ybar    Yn      S2     SE UCL_exact LCL_exact
   <dbl>  <dbl> <list>    <dbl> <dbl>   <dbl>  <dbl>     <dbl>     <dbl>
 1     5      1 <dbl [5]> 0.590 0.846 0.0585  0.108       4.42    0.0303
 2     5      2 <dbl [5]> 0.671 0.965 0.0420  0.0917     20.8     0.143 
 3     5      3 <dbl [5]> 0.821 0.998 0.0378  0.0869    348.      2.39  
 4     5      4 <dbl [5]> 0.683 0.973 0.0527  0.103      26.7     0.183 
 5     5      5 <dbl [5]> 0.665 0.835 0.0184  0.0606      4.09    0.0281
 6     5      6 <dbl [5]> 0.743 0.981 0.0347  0.0833     38.2     0.262 
 7     5      7 <dbl [5]> 0.891 0.952 0.00206 0.0203     14.9     0.102 
 8     5      8 <dbl [5]> 0.784 0.908 0.0276  0.0743      7.62    0.0523
 9     5      9 <dbl [5]> 0.766 0.997 0.0626  0.112     264.      1.81  
10     5     10 <dbl [5]> 0.826 0.996 0.0725  0.120     182.      1.25  
# ℹ 69,990 more rows
# ℹ 4 more variables: LCL_asymptotic <dbl>, UCL_asymptotic <dbl>,
#   exact_covers <dbl>, asymptotic_covers <dbl>
(betas_coverage <- many_beta_CIs
  %>% group_by(n)
  %>% summarize(exact_coverage = mean(exact_covers),
                asymptotic_coverage = mean(asymptotic_covers))
)
# A tibble: 7 × 3
      n exact_coverage asymptotic_coverage
  <dbl>          <dbl>               <dbl>
1     5          0.949               0.790
2    10          0.950               0.904
3    20          0.952               0.925
4    40          0.950               0.938
5    80          0.95                0.946
6   160          0.946               0.945
7   320          0.952               0.950
(ggplot(data = betas_coverage)
  + geom_line(aes(x = n, y = exact_coverage, color = 'exact'))
  + geom_line(aes(x = n, y = asymptotic_coverage, color = 'asymptotic'))
  + geom_hline(aes(yintercept = 0.95))
  + scale_color_discrete(name = '')
  + ylab('coverage')
  + theme_classic()
  + ggtitle('Simulated coverage of nominal 95% CIs for simulated betas')
  )

We see that the exact CI has a coverage rate of around 95% for every value of n. However, the asymptotic CI has too small of a coverage for small values of n and doesn’t meet the 95% threshold until roughly n = 100.

Question 6

library(purrrfect)
library(tidyverse)

N <- 10000

# UNIF(0,8)
(many_UNIF_CIs <- parameters(~n, c(5,20,50,100))
  %>% add_trials(N)
  %>% mutate(Ysample = map(n, .f = \(n) runif(n,0,8)))
  %>% mutate(Ybar = map_dbl(Ysample, mean),
             S2 = map_dbl(Ysample, var),
             SE = sqrt(S2/n))
  %>% mutate(LCL_mean = Ybar - qt(0.975, n-1) * SE,
             UCL_mean = Ybar + qt(0.975, n-1)*SE,
             LCL_var = ((n-1)*S2) / qchisq(0.975, n-1),
             UCL_var = ((n-1)*S2) / qchisq(0.025, n-1))
  %>% mutate(asymptotic_covers_mean = ifelse(LCL_mean < 4 & UCL_mean > 4, 1, 0),
             asymptotic_covers_var = ifelse(LCL_var < 16/3 & UCL_var > 16/3, 1, 0))
)
# A tibble: 40,000 × 12
       n .trial Ysample    Ybar    S2    SE LCL_mean UCL_mean LCL_var UCL_var
   <dbl>  <dbl> <list>    <dbl> <dbl> <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
 1     5      1 <dbl [5]>  4.06 4.92  0.992    1.31      6.81   1.76    40.6 
 2     5      2 <dbl [5]>  2.55 5.66  1.06    -0.400     5.51   2.03    46.7 
 3     5      3 <dbl [5]>  3.24 8.70  1.32    -0.421     6.90   3.12    71.8 
 4     5      4 <dbl [5]>  1.14 0.562 0.335    0.210     2.07   0.202    4.64
 5     5      5 <dbl [5]>  2.55 5.78  1.08    -0.438     5.53   2.08    47.8 
 6     5      6 <dbl [5]>  3.37 5.78  1.08     0.389     6.36   2.07    47.7 
 7     5      7 <dbl [5]>  6.13 5.30  1.03     3.27      8.99   1.90    43.8 
 8     5      8 <dbl [5]>  3.58 7.25  1.20     0.239     6.93   2.60    59.9 
 9     5      9 <dbl [5]>  3.66 3.55  0.843    1.32      6.00   1.27    29.3 
10     5     10 <dbl [5]>  3.56 3.21  0.801    1.34      5.79   1.15    26.5 
# ℹ 39,990 more rows
# ℹ 2 more variables: asymptotic_covers_mean <dbl>, asymptotic_covers_var <dbl>
(UNIFs_coverage <- many_UNIF_CIs
  %>% group_by(n)
  %>% summarize(asymptotic_coverage_mean = mean(asymptotic_covers_mean),
                asymptotic_coverage_var = mean(asymptotic_covers_var))
)
# A tibble: 4 × 3
      n asymptotic_coverage_mean asymptotic_coverage_var
  <dbl>                    <dbl>                   <dbl>
1     5                    0.936                   0.986
2    20                    0.950                   0.997
3    50                    0.944                   0.997
4   100                    0.948                   0.998
## EXP(B = 4)
(many_EXP_CIs <- parameters(~n, c(5,20,50,100))
  %>% add_trials(N)
  %>% mutate(Ysample = map(n, .f = \(n) rexp(n, rate = 1/4)))
  %>% mutate(Ybar = map_dbl(Ysample, mean),
             S2 = map_dbl(Ysample, var),
             SE = sqrt(S2/n))
  %>% mutate(LCL_mean = Ybar - qt(0.975, n-1) * SE,
             UCL_mean = Ybar + qt(0.975, n-1)*SE,
             LCL_var = ((n-1)*S2) / qchisq(0.975, n-1),
             UCL_var = ((n-1)*S2) / qchisq(0.025, n-1))
  %>% mutate(asymptotic_covers_mean = ifelse(LCL_mean < 4 & UCL_mean > 4, 1, 0),
             asymptotic_covers_var = ifelse(LCL_var < 16 & UCL_var > 16, 1, 0))
)
# A tibble: 40,000 × 12
       n .trial Ysample    Ybar     S2    SE LCL_mean UCL_mean LCL_var UCL_var
   <dbl>  <dbl> <list>    <dbl>  <dbl> <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
 1     5      1 <dbl [5]> 0.514  0.177 0.188 -0.00819     1.04  0.0634    1.46
 2     5      2 <dbl [5]> 3.03   2.41  0.694  1.10        4.96  0.865    19.9 
 3     5      3 <dbl [5]> 1.68   3.12  0.790 -0.516       3.87  1.12     25.8 
 4     5      4 <dbl [5]> 3.95  21.1   2.05  -1.75        9.66  7.57    174.  
 5     5      5 <dbl [5]> 4.71  20.3   2.02  -0.887      10.3   7.29    168.  
 6     5      6 <dbl [5]> 2.11   0.803 0.401  0.993       3.22  0.288     6.63
 7     5      7 <dbl [5]> 3.05   8.27  1.29  -0.525       6.62  2.97     68.3 
 8     5      8 <dbl [5]> 4.53   5.95  1.09   1.50        7.56  2.14     49.2 
 9     5      9 <dbl [5]> 3.69  14.5   1.71  -1.04        8.43  5.22    120.  
10     5     10 <dbl [5]> 4.30  46.1   3.04  -4.13       12.7  16.6     381.  
# ℹ 39,990 more rows
# ℹ 2 more variables: asymptotic_covers_mean <dbl>, asymptotic_covers_var <dbl>
(EXPs_coverage <- many_EXP_CIs
  %>% group_by(n)
  %>% summarize(asymptotic_coverage_mean = mean(asymptotic_covers_mean),
                asymptotic_coverage_var = mean(asymptotic_covers_var))
  )
# A tibble: 4 × 3
      n asymptotic_coverage_mean asymptotic_coverage_var
  <dbl>                    <dbl>                   <dbl>
1     5                    0.877                   0.821
2    20                    0.918                   0.727
3    50                    0.935                   0.697
4   100                    0.945                   0.696
## GEO(p = 0.2)
(many_GEO_CIs <- parameters(~n, c(5,20,50,100))
  %>% add_trials(N)
  %>% mutate(Ysample = map(n, .f = \(n) rgeom(n, 0.2)))
  %>% mutate(Ybar = map_dbl(Ysample, mean),
             S2 = map_dbl(Ysample, var),
             SE = sqrt(S2/n))
  %>% mutate(LCL_mean = Ybar - qt(0.975, n-1) * SE,
             UCL_mean = Ybar + qt(0.975, n-1)*SE,
             LCL_var = ((n-1)*S2) / qchisq(0.975, n-1),
             UCL_var = ((n-1)*S2) / qchisq(0.025, n-1))
  %>%  mutate(asymptotic_covers_mean = ifelse(LCL_mean < 4 & UCL_mean > 4, 1, 0),
           asymptotic_covers_var = ifelse(LCL_var < (1-0.2) / 0.2^2 & UCL_var > (1-0.2) / 0.2^2, 1, 0))
)
# A tibble: 40,000 × 12
       n .trial Ysample    Ybar    S2    SE LCL_mean UCL_mean LCL_var UCL_var
   <dbl>  <dbl> <list>    <dbl> <dbl> <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
 1     5      1 <int [5]>   5.6  25.3 2.25    -0.645    11.8    9.08    209. 
 2     5      2 <int [5]>   6.4  66.3 3.64    -3.71     16.5   23.8     547. 
 3     5      3 <int [5]>   3.2  16.7 1.83    -1.87      8.27   5.99    138. 
 4     5      4 <int [5]>   5.6  43.8 2.96    -2.62     13.8   15.7     362. 
 5     5      5 <int [5]>   1.8   1.7 0.583    0.181     3.42   0.610    14.0
 6     5      6 <int [5]>   3    12.5 1.58    -1.39      7.39   4.49    103. 
 7     5      7 <int [5]>   3.4  11.8 1.54    -0.865     7.67   4.24     97.4
 8     5      8 <int [5]>   2.6  22.8 2.14    -3.33      8.53   8.18    188. 
 9     5      9 <int [5]>   3.2  19.2 1.96    -2.24      8.64   6.89    159. 
10     5     10 <int [5]>   7.2  55.7 3.34    -2.07     16.5   20.0     460. 
# ℹ 39,990 more rows
# ℹ 2 more variables: asymptotic_covers_mean <dbl>, asymptotic_covers_var <dbl>
(GEOs_coverage <- many_GEO_CIs
  %>% group_by(n)
  %>% summarize(asymptotic_coverage_mean = mean(asymptotic_covers_mean),
                asymptotic_coverage_var = mean(asymptotic_covers_var))
  )
# A tibble: 4 × 3
      n asymptotic_coverage_mean asymptotic_coverage_var
  <dbl>                    <dbl>                   <dbl>
1     5                    0.881                   0.825
2    20                    0.918                   0.731
3    50                    0.937                   0.699
4   100                    0.944                   0.692
(UNIFs_coverage <- UNIFs_coverage %>% mutate(distribution = 'UNIF(0,8)'))
# A tibble: 4 × 4
      n asymptotic_coverage_mean asymptotic_coverage_var distribution
  <dbl>                    <dbl>                   <dbl> <chr>       
1     5                    0.936                   0.986 UNIF(0,8)   
2    20                    0.950                   0.997 UNIF(0,8)   
3    50                    0.944                   0.997 UNIF(0,8)   
4   100                    0.948                   0.998 UNIF(0,8)   
(EXPs_coverage <- EXPs_coverage %>% mutate(distribution = 'EXP(B = 4)'))
# A tibble: 4 × 4
      n asymptotic_coverage_mean asymptotic_coverage_var distribution
  <dbl>                    <dbl>                   <dbl> <chr>       
1     5                    0.877                   0.821 EXP(B = 4)  
2    20                    0.918                   0.727 EXP(B = 4)  
3    50                    0.935                   0.697 EXP(B = 4)  
4   100                    0.945                   0.696 EXP(B = 4)  
(GEOs_coverage <- GEOs_coverage %>% mutate(distribution = 'GEO(p = 0.2)'))
# A tibble: 4 × 4
      n asymptotic_coverage_mean asymptotic_coverage_var distribution
  <dbl>                    <dbl>                   <dbl> <chr>       
1     5                    0.881                   0.825 GEO(p = 0.2)
2    20                    0.918                   0.731 GEO(p = 0.2)
3    50                    0.937                   0.699 GEO(p = 0.2)
4   100                    0.944                   0.692 GEO(p = 0.2)
(mu_table <- bind_rows(
  UNIFs_coverage
    %>% select(n, asymptotic_coverage_mean)
    %>% rename('Y_i ~ UNIF(0,8)' = asymptotic_coverage_mean),
  
  EXPs_coverage
    %>% select(n, asymptotic_coverage_mean)
    %>% rename('Y_i ~ EXP(B = 4)' = asymptotic_coverage_mean),
  
  GEOs_coverage
    %>% select(n, asymptotic_coverage_mean)
    %>% rename('Y_i ~ GEO(p = 0.2)' = asymptotic_coverage_mean),
)
  %>% arrange(n)
  %>% mutate(across(-n, ~round(.,3)))
)
# A tibble: 12 × 4
       n `Y_i ~ UNIF(0,8)` `Y_i ~ EXP(B = 4)` `Y_i ~ GEO(p = 0.2)`
   <dbl>             <dbl>              <dbl>                <dbl>
 1     5             0.936             NA                   NA    
 2     5            NA                  0.877               NA    
 3     5            NA                 NA                    0.881
 4    20             0.95              NA                   NA    
 5    20            NA                  0.918               NA    
 6    20            NA                 NA                    0.918
 7    50             0.944             NA                   NA    
 8    50            NA                  0.935               NA    
 9    50            NA                 NA                    0.937
10   100             0.948             NA                   NA    
11   100            NA                  0.945               NA    
12   100            NA                 NA                    0.944
library(knitr)
kable(mu_table)
n Y_i ~ UNIF(0,8) Y_i ~ EXP(B = 4) Y_i ~ GEO(p = 0.2)
5 0.936 NA NA
5 NA 0.877 NA
5 NA NA 0.881
20 0.950 NA NA
20 NA 0.918 NA
20 NA NA 0.918
50 0.944 NA NA
50 NA 0.935 NA
50 NA NA 0.937
100 0.948 NA NA
100 NA 0.945 NA
100 NA NA 0.944
(variance_table <- bind_rows(
  UNIFs_coverage
    %>% select(n, asymptotic_coverage_var)
    %>% rename('Y_i ~ UNIF(0,8)' = asymptotic_coverage_var),
  
  EXPs_coverage
    %>% select(n, asymptotic_coverage_var)
    %>% rename('Y_i ~ EXP(B = 4)' = asymptotic_coverage_var),
  
  GEOs_coverage
    %>% select(n, asymptotic_coverage_var)
    %>% rename('Y_i ~ GEO(p = 0.2)' = asymptotic_coverage_var),
)
  %>% arrange(n)
  %>% mutate(across(-n, ~round(.,3)))
)
# A tibble: 12 × 4
       n `Y_i ~ UNIF(0,8)` `Y_i ~ EXP(B = 4)` `Y_i ~ GEO(p = 0.2)`
   <dbl>             <dbl>              <dbl>                <dbl>
 1     5             0.986             NA                   NA    
 2     5            NA                  0.821               NA    
 3     5            NA                 NA                    0.825
 4    20             0.997             NA                   NA    
 5    20            NA                  0.727               NA    
 6    20            NA                 NA                    0.731
 7    50             0.997             NA                   NA    
 8    50            NA                  0.697               NA    
 9    50            NA                 NA                    0.699
10   100             0.998             NA                   NA    
11   100            NA                  0.696               NA    
12   100            NA                 NA                    0.692
library(knitr)
kable(variance_table)
n Y_i ~ UNIF(0,8) Y_i ~ EXP(B = 4) Y_i ~ GEO(p = 0.2)
5 0.986 NA NA
5 NA 0.821 NA
5 NA NA 0.825
20 0.997 NA NA
20 NA 0.727 NA
20 NA NA 0.731
50 0.997 NA NA
50 NA 0.697 NA
50 NA NA 0.699
100 0.998 NA NA
100 NA 0.696 NA
100 NA NA 0.692

When looking specifically at our means table, n = 5 is problematic for both the EXP and GEO distribution. When switching gears to the variance table, we see extreme under-coverage as n increases for both EXP and GEO, with n = 100 having concerningly low variance. For both the mean and variance CIs, the consequences are the least severe for the UNIF distribution. Lastly, we see that the variance CIs are most affected by the consequences of the normality assumption.