Chapter 7.1

Question 5

library(tidyverse)
library(purrrfect)

dice <- c(1,2,3,4,5,6)
  
(mean_of_dice_rolls <- parameters(~n, c(20, 40, 80, 160))
  %>% add_trials(10000)
  %>% mutate(y_sample = map(n, ~sample(dice, .x, replace = TRUE)))
  %>% mutate(y_bar = map_dbl(y_sample, .f = mean))
)
# A tibble: 40,000 × 4
       n .trial y_sample   y_bar
   <dbl>  <dbl> <list>     <dbl>
 1    20      1 <dbl [20]>  3.3 
 2    20      2 <dbl [20]>  3.6 
 3    20      3 <dbl [20]>  3.25
 4    20      4 <dbl [20]>  3.45
 5    20      5 <dbl [20]>  3.3 
 6    20      6 <dbl [20]>  3.25
 7    20      7 <dbl [20]>  2.5 
 8    20      8 <dbl [20]>  3.35
 9    20      9 <dbl [20]>  2.9 
10    20     10 <dbl [20]>  3.1 
# ℹ 39,990 more rows
# Part A
(ggplot(data = mean_of_dice_rolls)
+ geom_histogram(aes(x = y_bar, y = after_stat(density)),
                 fill = 'goldenrod',
                 center = 0.025, binwidth = 0.05)
+ facet_wrap(~n, labeller = label_value)
+ theme_classic()
+ labs(title = 'Simulated sampling distributions of y_bar')
)

We are able to use map() instead of pmap() because we only have one parameter: n. The visualization reveals that as we increase our sample size n, the center remains unchanged but the variation/spread of the sampling distribution decreases.

# Part B
(mean_of_dice_rolls
  %>% summarize('Simulated E(y_bar)' = mean(y_bar),
                'Var(y_bar)' = var(y_bar), .by = n)
 %>% mutate('Analytic E(y_bar)' = 3.5)
 %>% mutate('Analytic Var(y_bar)' = (35/12)/n)
)
# A tibble: 4 × 5
      n `Simulated E(y_bar)` `Var(y_bar)` `Analytic E(y_bar)`
  <dbl>                <dbl>        <dbl>               <dbl>
1    20                 3.50       0.142                  3.5
2    40                 3.50       0.0730                 3.5
3    80                 3.50       0.0363                 3.5
4   160                 3.50       0.0189                 3.5
# ℹ 1 more variable: `Analytic Var(y_bar)` <dbl>

When looking at the aggregation table, we see that the means and variances are the same for the simulated and analytic calculations.

Question 6

library(tidyverse)
library(purrrfect)

(sum_of_pois <- parameters(~n, ~lambda,
                           c(2,5,10), c(0.5,1,1.5)
                           )
  %>% add_trials(10000)
  %>% mutate(y_sample = pmap(list(n, lambda), .f = \(n,l) rpois(n,l)))
  %>% mutate(u = map_dbl(y_sample, .f = sum))
  %>% mutate(F_u = ppois(u, (n*lambda)),
             Fhat = cume_dist(u), .by = c(n, lambda))
)
# A tibble: 90,000 × 7
       n lambda .trial y_sample      u   F_u  Fhat
   <dbl>  <dbl>  <dbl> <list>    <dbl> <dbl> <dbl>
 1     2    0.5      1 <int [2]>     3 0.981 0.980
 2     2    0.5      2 <int [2]>     0 0.368 0.371
 3     2    0.5      3 <int [2]>     0 0.368 0.371
 4     2    0.5      4 <int [2]>     3 0.981 0.980
 5     2    0.5      5 <int [2]>     0 0.368 0.371
 6     2    0.5      6 <int [2]>     2 0.920 0.917
 7     2    0.5      7 <int [2]>     0 0.368 0.371
 8     2    0.5      8 <int [2]>     0 0.368 0.371
 9     2    0.5      9 <int [2]>     2 0.920 0.917
10     2    0.5     10 <int [2]>     1 0.736 0.741
# ℹ 89,990 more rows
(ggplot(data = sum_of_pois, aes(x = u))
 + geom_step(aes(y = F_u, col = 'Analytic CDF'))
 + geom_step(aes(y = Fhat, col = 'Empirical CDF'))
 + facet_grid(n~lambda, labeller = label_both)
 + labs(y = expression(P(u)), 
        title = 'Simulated and analytic CDFs of sample sum from POI population', color = '')
 + theme_classic()
)