Chapter 7.7

Question 5

Population 1: UNIF(0,8)

library(tidyverse)
library(purrrfect)

N <- 1000
(unif_sims <- parameters(~n, 
                         c(5, 10, 20, 40, 80, 160)
                         )
  %>% add_trials(N)
  %>% mutate(YSample = map(n, .f = \(n) runif(n,0,8)))
  %>% mutate(Ybar = map_dbl(YSample, mean))
  %>% mutate(fU = dnorm(Ybar, mean = 4, sd = sqrt(8^2/12)/sqrt(n)), 
             FU = pnorm(Ybar, mean = 4, sd = sqrt(8^2/12)/sqrt(n)))
  %>% mutate(Fhat = cume_dist(Ybar),
             .by = n)
)
# A tibble: 6,000 × 7
       n .trial YSample    Ybar    fU     FU  Fhat
   <dbl>  <dbl> <list>    <dbl> <dbl>  <dbl> <dbl>
 1     5      1 <dbl [5]>  3.54 0.350 0.329  0.322
 2     5      2 <dbl [5]>  4.65 0.318 0.734  0.722
 3     5      3 <dbl [5]>  2.81 0.198 0.124  0.127
 4     5      4 <dbl [5]>  3.61 0.360 0.353  0.352
 5     5      5 <dbl [5]>  3.74 0.374 0.400  0.394
 6     5      6 <dbl [5]>  2.66 0.166 0.0966 0.103
 7     5      7 <dbl [5]>  4.41 0.357 0.654  0.652
 8     5      8 <dbl [5]>  3.90 0.384 0.461  0.451
 9     5      9 <dbl [5]>  4.29 0.372 0.610  0.609
10     5     10 <dbl [5]>  4.75 0.296 0.767  0.754
# ℹ 5,990 more rows
(ggplot(data = unif_sims, aes(x = Ybar))
 + geom_histogram(aes(y = after_stat(density)),
                  fill = 'goldenrod', binwidth = 0.2)
 + geom_line(aes(y = fU), col = 'cornflowerblue')
 + facet_wrap(~n, labeller = label_both)
 + labs(x = expression(bar(Y)))
 + theme_classic()
 )

(ggplot(data = unif_sims, aes(x = Ybar))
 + geom_step(aes(y = FU, col = 'Analytic Normal CDF'))
 + geom_step(aes(y = Fhat, col = 'Empirical CDF'))
 + facet_wrap(~n, labeller = label_both)
 + labs(color = '',
        x = expression(bar(Y)),
        y = 'CDF')
 + theme_classic()
 )

Population 2: GAMMA(2,2)

#|message: false
#|error: false

library(tidyverse)
library(purrrfect)

N <- 10000
(GAM_sims <- parameters(~n, 
                        c(5,10,20,40,80,160)
                        )
  %>% add_trials(N)
  %>% mutate(Y_sample = map(n, .f = \(n) rgamma(n, shape = 2, scale = 2)))
  %>% mutate(Ybar = map_dbl(Y_sample, mean))
  %>% mutate(fU = dnorm(Ybar, mean = 4, sd = sqrt(8)/sqrt(n)),
             FU = pnorm(Ybar, mean = 4, sd = sqrt(8)/sqrt(n)),
             Fhat = cume_dist(Ybar),
             .by = n)
)
# A tibble: 60,000 × 7
       n .trial Y_sample   Ybar      fU     FU   Fhat
   <dbl>  <dbl> <list>    <dbl>   <dbl>  <dbl>  <dbl>
 1     5      1 <dbl [5]>  4.55 0.287   0.669  0.701 
 2     5      2 <dbl [5]>  2.10 0.101   0.0661 0.0392
 3     5      3 <dbl [5]>  5.01 0.229   0.787  0.796 
 4     5      4 <dbl [5]>  3.25 0.265   0.277  0.302 
 5     5      5 <dbl [5]>  7.59 0.00568 0.998  0.990 
 6     5      6 <dbl [5]>  5.77 0.118   0.920  0.911 
 7     5      7 <dbl [5]>  6.26 0.0643  0.963  0.946 
 8     5      8 <dbl [5]>  6.08 0.0819  0.950  0.935 
 9     5      9 <dbl [5]>  4.46 0.295   0.643  0.678 
10     5     10 <dbl [5]>  3.23 0.262   0.271  0.292 
# ℹ 59,990 more rows
(ggplot(data = GAM_sims, aes(x = Ybar))
 + geom_histogram(aes(y = after_stat(density)),
                  fill = 'goldenrod', binwidth = 0.2)
 + geom_line(aes(y = fU), col = 'cornflowerblue')
 + facet_wrap(~n, labeller = label_both)
 + labs(x = expression(bar(Y)))
 + theme_classic()
 )

(ggplot(data = GAM_sims, aes(x = Ybar))
 + geom_step(aes(y = FU, col = 'Analytic Normal CDF'))
 + geom_step(aes(y = Fhat, col = 'empirical CDF'))
 + facet_wrap(~n, labeller = label_both)
 + labs(color = '',
        x = expression(bar(Y)),
        y = 'CDF')
 + theme_classic()
 )

Population 3: POI(4)

library(tidyverse)
library(purrrfect)

N <- 10000
(pois_sims <- parameters(~n,
                        c(5,10,20,40,80,160))
  %>% add_trials(N)
  %>% mutate(Y_sample = map(n, .f = \(n) rpois(n, lambda = 4)))
  %>% mutate(Ybar = map_dbl(Y_sample, mean))
  %>% mutate(fU = dnorm(Ybar, mean = 4, sd = 2/sqrt(n)),
             FU = pnorm(Ybar, mean = 4, sd = 2/sqrt(n)),
             Fhat = cume_dist(Ybar),
             .by = n)
  )
# A tibble: 60,000 × 7
       n .trial Y_sample   Ybar      fU     FU  Fhat
   <dbl>  <dbl> <list>    <dbl>   <dbl>  <dbl> <dbl>
 1     5      1 <int [5]>   3   0.239   0.132  0.162
 2     5      2 <int [5]>   4.4 0.404   0.673  0.721
 3     5      3 <int [5]>   2.8 0.181   0.0899 0.107
 4     5      4 <int [5]>   3.6 0.404   0.327  0.383
 5     5      5 <int [5]>   3.6 0.404   0.327  0.383
 6     5      6 <int [5]>   6.8 0.00332 0.999  0.998
 7     5      7 <int [5]>   3.8 0.435   0.412  0.476
 8     5      8 <int [5]>   3.8 0.435   0.412  0.476
 9     5      9 <int [5]>   3.2 0.299   0.186  0.223
10     5     10 <int [5]>   3.4 0.356   0.251  0.299
# ℹ 59,990 more rows
(ggplot(data = pois_sims, aes(x = Ybar))
 + geom_histogram(aes(y = after_stat(density)),
                  fill = 'goldenrod', binwidth = 0.2)
 + geom_line(aes(y = fU), col = 'cornflowerblue')
 + facet_wrap(~n, labeller = label_both)
 + labs(x = expression(bar(Y)))
 + theme_classic()
   )

(ggplot(data = pois_sims, aes(x = Ybar))
 + geom_step(aes(y = FU, col = 'Analytic Normal CDF'))
 + geom_step(aes(y = Fhat, col = 'empirical CDF'))
 + facet_wrap(~n, labeller = label_both)
 + labs(col = '',
        x = expression(bar(Y)),
        y = 'CDF')
 + theme_classic()
 )

Summary of Results

Overall, it appears that normality kicks in the “fastest” for the UNIF distribution, as obvious “normality” patterns begins to occur around n = 20. In general, Ybar appears “normal” around n=40 regardless of the distribution of the parent population.