Question 1
Attaching package: 'purrrfect'
The following objects are masked from 'package:base':
replicate, tabulate
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 4.0.0 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.1.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
t <- 5
(survival <- parameters (~ n, ~ beta,
c (20 , 40 , 80 , 160 , 320 , 640 ), c (0.5 , 2 , 4 ))
%>% add_trials (10000 )
%>% mutate (Ysample = pmap (list (n,beta), .f = \(n,b) rexp (n, rate = 1 / b)))
%>% mutate (ybar = map_dbl (Ysample, mean),
theta = exp (- t/ beta))
)
# A tibble: 180,000 × 6
n beta .trial Ysample ybar theta
<dbl> <dbl> <dbl> <list> <dbl> <dbl>
1 20 0.5 1 <dbl [20]> 0.382 0.0000454
2 20 0.5 2 <dbl [20]> 0.611 0.0000454
3 20 0.5 3 <dbl [20]> 0.335 0.0000454
4 20 0.5 4 <dbl [20]> 0.586 0.0000454
5 20 0.5 5 <dbl [20]> 0.480 0.0000454
6 20 0.5 6 <dbl [20]> 0.607 0.0000454
7 20 0.5 7 <dbl [20]> 0.498 0.0000454
8 20 0.5 8 <dbl [20]> 0.520 0.0000454
9 20 0.5 9 <dbl [20]> 0.369 0.0000454
10 20 0.5 10 <dbl [20]> 0.645 0.0000454
# ℹ 179,990 more rows
(survival_with_ci <- survival
%>% mutate (beta_hat = ybar,
theta_hat = exp (- t / beta_hat),
se_theta = (t / (beta_hat * sqrt (n)) * exp (- t / beta_hat)),
lcl = theta_hat - 1.96 * se_theta,
ucl = theta_hat + 1.96 * se_theta
)
%>% mutate (covers = ifelse (lcl < theta & ucl > theta, 1 , 0 ))
)
# A tibble: 180,000 × 12
n beta .trial Ysample ybar theta beta_hat theta_hat se_theta lcl
<dbl> <dbl> <dbl> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 20 0.5 1 <dbl> 0.382 4.54e-5 0.382 2.04e-6 5.97e-6 -9.66e-6
2 20 0.5 2 <dbl> 0.611 4.54e-5 0.611 2.80e-4 5.12e-4 -7.23e-4
3 20 0.5 3 <dbl> 0.335 4.54e-5 0.335 3.31e-7 1.10e-6 -1.83e-6
4 20 0.5 4 <dbl> 0.586 4.54e-5 0.586 1.98e-4 3.77e-4 -5.41e-4
5 20 0.5 5 <dbl> 0.480 4.54e-5 0.480 2.97e-5 6.92e-5 -1.06e-4
6 20 0.5 6 <dbl> 0.607 4.54e-5 0.607 2.66e-4 4.90e-4 -6.94e-4
7 20 0.5 7 <dbl> 0.498 4.54e-5 0.498 4.38e-5 9.83e-5 -1.49e-4
8 20 0.5 8 <dbl> 0.520 4.54e-5 0.520 6.68e-5 1.44e-4 -2.15e-4
9 20 0.5 9 <dbl> 0.369 4.54e-5 0.369 1.32e-6 4.00e-6 -6.52e-6
10 20 0.5 10 <dbl> 0.645 4.54e-5 0.645 4.30e-4 7.46e-4 -1.03e-3
# ℹ 179,990 more rows
# ℹ 2 more variables: ucl <dbl>, covers <dbl>
(survival_coverage <- survival_with_ci
%>% summarize (coverage = mean (covers),
.by = c (n, beta)
)
)
# A tibble: 18 × 3
n beta coverage
<dbl> <dbl> <dbl>
1 20 0.5 0.733
2 20 2 0.883
3 20 4 0.925
4 40 0.5 0.794
5 40 2 0.917
6 40 4 0.941
7 80 0.5 0.841
8 80 2 0.930
9 80 4 0.940
10 160 0.5 0.874
11 160 2 0.941
12 160 4 0.946
13 320 0.5 0.901
14 320 2 0.944
15 320 4 0.947
16 640 0.5 0.929
17 640 2 0.947
18 640 4 0.948
(ggplot (data = survival_coverage)
+ geom_line (aes (x = n, y = coverage))
+ theme_classic (base_size = 10 )
+ labs (y = 'Coverage' )
+ geom_hline (aes (yintercept = 0.95 ), linetype = 2 )
+ facet_wrap (~ beta, labeller = label_both)
)
As beta increases, we see that we reach asymptotic 95% coverage faster (as we increase beta, we achieve 95% coverage at smaller values of n).
Question 2
library (purrrfect)
library (tidyverse)
#loglik.alpha function
loglik.alpha <- function (alpha, yvals) {
n <- length (yvals)
ybar <- mean (yvals)
n * (alpha - 1 ) * mean (log (yvals)) -
n * lgamma (alpha) -
n * alpha * log (ybar / alpha) -
n * alpha
}
#get.mle function
get.mle<- function (yvals) {
opt <- optimize (
f = loglik.alpha,
interval = c (0.001 , 15 ),
yvals = yvals,
maximum = TRUE
)
alpha.hat <- opt$ maximum
beta.hat <- mean (yvals) / alpha.hat
c (alpha.hat = alpha.hat, beta.hat = beta.hat)
}
#verify the function works
my.y<- c (6.690889 , 1.989313 , 4.884504 , 2.142505 , 4.177150 )
get.mle (my.y)
alpha.hat beta.hat
4.8222767 0.8246877
# simulation
alpha <- 4
beta <- 2
(sim_gamma <- parameters (~ n,
c (5 , 20 , 50 , 100 , 150 , 200 ))
%>% add_trials (10000 )
%>% mutate (Ysample = map (n, \(n) rgamma (n, shape = alpha, scale = beta)))
%>% mutate (mle = map (Ysample, get.mle),
alphahat = map_dbl (mle, 1 ),
betahat = map_dbl (mle, 2 ))
)
# A tibble: 60,000 × 6
n .trial Ysample mle alphahat betahat
<dbl> <dbl> <list> <list> <dbl> <dbl>
1 5 1 <dbl [5]> <dbl [2]> 10.7 0.807
2 5 2 <dbl [5]> <dbl [2]> 3.37 2.61
3 5 3 <dbl [5]> <dbl [2]> 10.9 0.652
4 5 4 <dbl [5]> <dbl [2]> 10.8 0.777
5 5 5 <dbl [5]> <dbl [2]> 6.60 1.05
6 5 6 <dbl [5]> <dbl [2]> 15.0 0.462
7 5 7 <dbl [5]> <dbl [2]> 2.58 3.93
8 5 8 <dbl [5]> <dbl [2]> 11.3 0.774
9 5 9 <dbl [5]> <dbl [2]> 5.82 0.948
10 5 10 <dbl [5]> <dbl [2]> 3.02 2.08
# ℹ 59,990 more rows
# plot alpha
(ggplot (data = sim_gamma)
+ geom_histogram (aes (x = alphahat),
binwidth = 0.3 ,
center = 0.15 ,
fill = 'grey' ,
color = 'black' )
+ geom_vline (xintercept = alpha,
lty = 2 ,
linewidth = 1 ,
col = 'blue' )
+ facet_wrap (~ n, labeller = label_both)
+ coord_cartesian (xlim = c (0 ,15 ))
+ theme_classic (base_size = 18 )
+ labs (x = expression (hat (alpha)[MLE]),
y = 'Count'
)
)
# plot beta
(ggplot (data = sim_gamma)
+ geom_histogram (aes (x = betahat),
binwidth = 0.1 ,
center = 0.05 ,
fill = 'grey' ,
col = 'black' )
+ geom_vline (xintercept = beta,
lty = 2 ,
linewidth = 1 ,
col = 'blue' )
+ facet_wrap (~ n, labeller = label_both)
+ coord_cartesian (xlim = c (0 ,5 ))
+ theme_classic (base_size = 18 )
+ labs (x = expression (hat (beta)[MLE]),
y = 'Count'
)
)
Question 3
library (purrrfect)
library (tidyverse)
tau <- function (p) p* (1 - p)
crlb_tau <- function (p, n) {
p * (1 - p) * (1 - 2 * p)^ 2 / n
}
(sim_tau <- parameters (~ n, ~ p,
c (10 , 20 , 40 , 80 , 160 , 320 , 640 , 1280 ),
c (0.1 , 0.2 , 0.3 , 0.4 , 0.5 ))
%>% add_trials (10000 )
%>% mutate (Ysample = pmap (list (n, p), .f = \(n,p) rbinom (n, size = 1 , prob = p)))
%>% mutate (phat = map_dbl (Ysample, mean),
tauhat = phat * (1 - phat),
tau_true = tau (p),
crlb_hat = crlb_tau (phat, n),
lcl = tauhat - 1.96 * sqrt (crlb_hat),
ucl = tauhat + 1.96 * sqrt (crlb_hat))
%>% mutate (covers = ifelse (lcl <= tau_true & ucl >= tau_true, 1 , 0 ))
)
# A tibble: 400,000 × 11
n p .trial Ysample phat tauhat tau_true crlb_hat lcl ucl
<dbl> <dbl> <dbl> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 10 0.1 1 <int [10]> 0 0 0.09 0 0 0
2 10 0.1 2 <int [10]> 0.1 0.09 0.09 0.00576 -0.0588 0.239
3 10 0.1 3 <int [10]> 0 0 0.09 0 0 0
4 10 0.1 4 <int [10]> 0.2 0.16 0.09 0.00576 0.0112 0.309
5 10 0.1 5 <int [10]> 0 0 0.09 0 0 0
6 10 0.1 6 <int [10]> 0.3 0.21 0.09 0.00336 0.0964 0.324
7 10 0.1 7 <int [10]> 0.3 0.21 0.09 0.00336 0.0964 0.324
8 10 0.1 8 <int [10]> 0.3 0.21 0.09 0.00336 0.0964 0.324
9 10 0.1 9 <int [10]> 0.1 0.09 0.09 0.00576 -0.0588 0.239
10 10 0.1 10 <int [10]> 0.1 0.09 0.09 0.00576 -0.0588 0.239
# ℹ 399,990 more rows
# ℹ 1 more variable: covers <dbl>
Part A
(ggplot (data = sim_tau, aes (x = tauhat, y = after_stat (density)))
+ geom_histogram (binwidth = 0.0001 ,
fill = 'grey' ,
col = 'black' )
+ geom_vline (aes (xintercept = tau_true),
lty = 2 ,
linewidth = 0.8 ,
col = 'blue' )
+ facet_grid (p~ n, labeller = label_both, scales = 'free' )
+ theme_classic (base_size = 10 )
+ labs (x = expression (hat (tau)(p)),
y = 'Density'
)
)
We see that as n increases, our distributions tighten (less variability) and become more normal. We also see that as p increases towards 0.5, it actually takes longer for asymptotic normality to kick in. When p = 0.5, it appears that we do not reach asymptotic normality and our distribution remains extremely left skewed.
Part B
(coverage <- sim_tau
%>% summarize (coverage = mean (covers),
.by = c (n,p))
)
# A tibble: 40 × 3
n p coverage
<dbl> <dbl> <dbl>
1 10 0.1 0.580
2 10 0.2 0.775
3 10 0.3 0.865
4 10 0.4 0.752
5 10 0.5 0.978
6 20 0.1 0.841
7 20 0.2 0.844
8 20 0.3 0.855
9 20 0.4 0.864
10 20 0.5 0.998
# ℹ 30 more rows
(ggplot (data = coverage)
+ geom_line (aes (x = n, y = coverage))
+ theme_classic (base_size = 18 )
+ labs (y = 'Coverage' )
+ geom_hline (aes (yintercept = 0.95 ), linetype = 2 )
+ facet_wrap (~ p, labeller = label_both)
)
We see that the coverage results in part B tend to be worse when the histogram in part A is less normal (i.e. p = 0.4 has a very skewed distribution in Part A and it takes longer to reach 95% coverage in part B).
Part C
Since tau(p) = p(1-p) and p ranges from [0,1], tau ranges from 0 (when p 0.999…) to 0.25 (when p = 0.5).