Chapter 9.4

Question 1

library(purrrfect)


Attaching package: 'purrrfect'

The following objects are masked from 'package:base':

    replicate, tabulate

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

t <- 5

(survival <- parameters(~n, ~beta,
                        c(20, 40, 80, 160, 320, 640), c(0.5, 2, 4))
  %>% add_trials(10000)
  %>% mutate(Ysample = pmap(list(n,beta), .f = \(n,b) rexp(n, rate = 1 / b)))
  %>% mutate(ybar = map_dbl(Ysample, mean),
             theta = exp(-t/beta))
  )

# A tibble: 180,000 × 6
       n  beta .trial Ysample     ybar     theta
   <dbl> <dbl>  <dbl> <list>     <dbl>     <dbl>
 1    20   0.5      1 <dbl [20]> 0.382 0.0000454
 2    20   0.5      2 <dbl [20]> 0.611 0.0000454
 3    20   0.5      3 <dbl [20]> 0.335 0.0000454
 4    20   0.5      4 <dbl [20]> 0.586 0.0000454
 5    20   0.5      5 <dbl [20]> 0.480 0.0000454
 6    20   0.5      6 <dbl [20]> 0.607 0.0000454
 7    20   0.5      7 <dbl [20]> 0.498 0.0000454
 8    20   0.5      8 <dbl [20]> 0.520 0.0000454
 9    20   0.5      9 <dbl [20]> 0.369 0.0000454
10    20   0.5     10 <dbl [20]> 0.645 0.0000454
# ℹ 179,990 more rows

(survival_with_ci <- survival
  %>% mutate(beta_hat = ybar,
             theta_hat = exp(-t / beta_hat),
             se_theta = (t / (beta_hat * sqrt(n)) * exp(-t / beta_hat)),
             lcl = theta_hat - 1.96 * se_theta,
             ucl = theta_hat + 1.96 * se_theta
             )
  %>% mutate(covers = ifelse(lcl < theta & ucl > theta, 1, 0))
  )

# A tibble: 180,000 × 12
       n  beta .trial Ysample  ybar   theta beta_hat theta_hat se_theta      lcl
   <dbl> <dbl>  <dbl> <list>  <dbl>   <dbl>    <dbl>     <dbl>    <dbl>    <dbl>
 1    20   0.5      1 <dbl>   0.382 4.54e-5    0.382   2.04e-6  5.97e-6 -9.66e-6
 2    20   0.5      2 <dbl>   0.611 4.54e-5    0.611   2.80e-4  5.12e-4 -7.23e-4
 3    20   0.5      3 <dbl>   0.335 4.54e-5    0.335   3.31e-7  1.10e-6 -1.83e-6
 4    20   0.5      4 <dbl>   0.586 4.54e-5    0.586   1.98e-4  3.77e-4 -5.41e-4
 5    20   0.5      5 <dbl>   0.480 4.54e-5    0.480   2.97e-5  6.92e-5 -1.06e-4
 6    20   0.5      6 <dbl>   0.607 4.54e-5    0.607   2.66e-4  4.90e-4 -6.94e-4
 7    20   0.5      7 <dbl>   0.498 4.54e-5    0.498   4.38e-5  9.83e-5 -1.49e-4
 8    20   0.5      8 <dbl>   0.520 4.54e-5    0.520   6.68e-5  1.44e-4 -2.15e-4
 9    20   0.5      9 <dbl>   0.369 4.54e-5    0.369   1.32e-6  4.00e-6 -6.52e-6
10    20   0.5     10 <dbl>   0.645 4.54e-5    0.645   4.30e-4  7.46e-4 -1.03e-3
# ℹ 179,990 more rows
# ℹ 2 more variables: ucl <dbl>, covers <dbl>

(survival_coverage <- survival_with_ci
  %>% summarize(coverage = mean(covers),
                .by = c(n, beta)
                )
  )

# A tibble: 18 × 3
       n  beta coverage
   <dbl> <dbl>    <dbl>
 1    20   0.5    0.733
 2    20   2      0.883
 3    20   4      0.925
 4    40   0.5    0.794
 5    40   2      0.917
 6    40   4      0.941
 7    80   0.5    0.841
 8    80   2      0.930
 9    80   4      0.940
10   160   0.5    0.874
11   160   2      0.941
12   160   4      0.946
13   320   0.5    0.901
14   320   2      0.944
15   320   4      0.947
16   640   0.5    0.929
17   640   2      0.947
18   640   4      0.948

(ggplot(data = survival_coverage)
  + geom_line(aes(x = n, y = coverage))
  + theme_classic(base_size = 10)
  + labs(y = 'Coverage')
  + geom_hline(aes(yintercept = 0.95), linetype = 2)
  + facet_wrap(~beta, labeller = label_both)
  )

As beta increases, we see that we reach asymptotic 95% coverage faster (as we increase beta, we achieve 95% coverage at smaller values of n).

Question 2

library(purrrfect)
library(tidyverse)

#loglik.alpha function
loglik.alpha <- function(alpha, yvals) {
  n <- length(yvals)
  ybar <- mean(yvals)
  
  n * (alpha - 1) * mean(log(yvals)) - 
    n * lgamma(alpha) - 
    n * alpha * log(ybar / alpha) -
    n * alpha
}

#get.mle function
get.mle<- function(yvals) {
  opt <- optimize(
    f = loglik.alpha,
    interval = c(0.001, 15),
    yvals = yvals,
    maximum = TRUE
  )
  
  alpha.hat <- opt$maximum
  beta.hat <- mean(yvals) / alpha.hat
  
  c(alpha.hat = alpha.hat, beta.hat = beta.hat)
}

#verify the function works
my.y<- c(6.690889, 1.989313, 4.884504, 2.142505, 4.177150)
get.mle(my.y)

alpha.hat  beta.hat 
4.8222767 0.8246877

# simulation
alpha <- 4
beta <- 2

(sim_gamma <- parameters(~n, 
                         c(5, 20, 50, 100, 150, 200))
  %>% add_trials(10000)
  %>% mutate(Ysample = map(n, \(n) rgamma(n, shape = alpha, scale = beta)))
  %>% mutate(mle = map(Ysample, get.mle),
             alphahat = map_dbl(mle, 1),
             betahat = map_dbl(mle, 2))
  )

# A tibble: 60,000 × 6
       n .trial Ysample   mle       alphahat betahat
   <dbl>  <dbl> <list>    <list>       <dbl>   <dbl>
 1     5      1 <dbl [5]> <dbl [2]>    10.7    0.807
 2     5      2 <dbl [5]> <dbl [2]>     3.37   2.61 
 3     5      3 <dbl [5]> <dbl [2]>    10.9    0.652
 4     5      4 <dbl [5]> <dbl [2]>    10.8    0.777
 5     5      5 <dbl [5]> <dbl [2]>     6.60   1.05 
 6     5      6 <dbl [5]> <dbl [2]>    15.0    0.462
 7     5      7 <dbl [5]> <dbl [2]>     2.58   3.93 
 8     5      8 <dbl [5]> <dbl [2]>    11.3    0.774
 9     5      9 <dbl [5]> <dbl [2]>     5.82   0.948
10     5     10 <dbl [5]> <dbl [2]>     3.02   2.08 
# ℹ 59,990 more rows

# plot alpha
(ggplot(data = sim_gamma)
  + geom_histogram(aes(x = alphahat),
                   binwidth = 0.3,
                   center = 0.15,
                   fill = 'grey',
                   color = 'black')
  + geom_vline(xintercept = alpha,
               lty = 2,
               linewidth = 1,
               col = 'blue')
  + facet_wrap(~n, labeller = label_both)
  + coord_cartesian(xlim = c(0,15))
  + theme_classic(base_size = 18)
  + labs(x = expression(hat(alpha)[MLE]),
         y = 'Count'
         )
  )

# plot beta
(ggplot(data = sim_gamma)
  + geom_histogram(aes(x = betahat),
                   binwidth = 0.1,
                   center = 0.05,
                   fill = 'grey',
                   col = 'black')
  + geom_vline(xintercept = beta,
               lty = 2,
               linewidth = 1,
               col = 'blue')
  + facet_wrap(~n, labeller = label_both)
  + coord_cartesian(xlim = c(0,5))
  + theme_classic(base_size = 18)
  + labs(x = expression(hat(beta)[MLE]),
         y = 'Count'
         )
  )

Question 3

library(purrrfect)
library(tidyverse)

tau <- function(p) p*(1-p)
crlb_tau <- function(p, n) {
  p * (1 - p) * (1 - 2*p)^2 / n
}

(sim_tau <- parameters(~n, ~p,
                       c(10, 20, 40, 80, 160, 320, 640, 1280),
                       c(0.1, 0.2, 0.3, 0.4, 0.5))
  %>% add_trials(10000)
  %>% mutate(Ysample = pmap(list(n, p), .f = \(n,p) rbinom(n, size = 1, prob = p)))
  %>% mutate(phat = map_dbl(Ysample, mean),
             tauhat = phat * (1 - phat),
             tau_true = tau(p),
             crlb_hat = crlb_tau(phat, n),
             lcl = tauhat - 1.96 * sqrt(crlb_hat),
             ucl = tauhat + 1.96 * sqrt(crlb_hat))
  %>% mutate(covers = ifelse(lcl <= tau_true & ucl >= tau_true, 1, 0))
  )

# A tibble: 400,000 × 11
       n     p .trial Ysample     phat tauhat tau_true crlb_hat     lcl   ucl
   <dbl> <dbl>  <dbl> <list>     <dbl>  <dbl>    <dbl>    <dbl>   <dbl> <dbl>
 1    10   0.1      1 <int [10]>   0     0        0.09  0        0      0    
 2    10   0.1      2 <int [10]>   0.1   0.09     0.09  0.00576 -0.0588 0.239
 3    10   0.1      3 <int [10]>   0     0        0.09  0        0      0    
 4    10   0.1      4 <int [10]>   0.2   0.16     0.09  0.00576  0.0112 0.309
 5    10   0.1      5 <int [10]>   0     0        0.09  0        0      0    
 6    10   0.1      6 <int [10]>   0.3   0.21     0.09  0.00336  0.0964 0.324
 7    10   0.1      7 <int [10]>   0.3   0.21     0.09  0.00336  0.0964 0.324
 8    10   0.1      8 <int [10]>   0.3   0.21     0.09  0.00336  0.0964 0.324
 9    10   0.1      9 <int [10]>   0.1   0.09     0.09  0.00576 -0.0588 0.239
10    10   0.1     10 <int [10]>   0.1   0.09     0.09  0.00576 -0.0588 0.239
# ℹ 399,990 more rows
# ℹ 1 more variable: covers <dbl>

Part A

(ggplot(data = sim_tau, aes(x = tauhat, y = after_stat(density)))
  + geom_histogram(binwidth = 0.0001,
                   fill = 'grey',
                   col = 'black')
  + geom_vline(aes(xintercept = tau_true),
               lty = 2,
               linewidth = 0.8,
               col = 'blue')
  + facet_grid(p~n, labeller = label_both, scales = 'free')
  + theme_classic(base_size = 10)
  + labs(x = expression(hat(tau)(p)),
         y = 'Density'
         )
  )

We see that as n increases, our distributions tighten (less variability) and become more normal. We also see that as p increases towards 0.5, it actually takes longer for asymptotic normality to kick in. When p = 0.5, it appears that we do not reach asymptotic normality and our distribution remains extremely left skewed.

Part B

(coverage <- sim_tau
 %>% summarize(coverage = mean(covers),
               .by = c(n,p))
 )

# A tibble: 40 × 3
       n     p coverage
   <dbl> <dbl>    <dbl>
 1    10   0.1    0.580
 2    10   0.2    0.775
 3    10   0.3    0.865
 4    10   0.4    0.752
 5    10   0.5    0.978
 6    20   0.1    0.841
 7    20   0.2    0.844
 8    20   0.3    0.855
 9    20   0.4    0.864
10    20   0.5    0.998
# ℹ 30 more rows

(ggplot(data = coverage)
  + geom_line(aes(x = n, y = coverage))
  + theme_classic(base_size = 18)
  + labs(y = 'Coverage')
  + geom_hline(aes(yintercept = 0.95), linetype = 2)
  + facet_wrap(~p, labeller = label_both)
  )

We see that the coverage results in part B tend to be worse when the histogram in part A is less normal (i.e. p = 0.4 has a very skewed distribution in Part A and it takes longer to reach 95% coverage in part B).

Part C

Since tau(p) = p(1-p) and p ranges from [0,1], tau ranges from 0 (when p 0.999…) to 0.25 (when p = 0.5).