PS 7.4

5 - For a grid of n∈{4,8,15} and σ∈{1,2,3}, verify that ((n-1) S^2)/σ2 ∼χ_(n-1)^2 by simulating 10,000 realizations of this statistic for each parameter combination in the grid and plotting the simulated densities superimposed with the analytic densities. Additionally, use these simulation results to create a table of the simulated mean and variance of S^2 compared to their analytic counterparts, to verify that your computations from 4B are correct.

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(purrrfect)


Attaching package: 'purrrfect'

The following objects are masked from 'package:base':

    replicate, tabulate

library(ggh4x)

N <- 10000

chi_sq_sample <- parameters(~n, ~sigma,
                            c(4, 8, 15), c(1, 2, 3)
) %>% add_trials(N) %>%
  mutate(y_sample = pmap(list(n, sigma), \(nn, ss) rnorm(nn, 0, ss))) %>%
  mutate(S2 = map_dbl(y_sample, var),
         scaled_S2 = pmap_dbl(list(S2, n, sigma), \(s2, nn, ss) (nn-1) * s2 / ss^2)
  )


chi_sq_sample <- chi_sq_sample %>%
  mutate(f_chi = pmap_dbl(list(scaled_S2, n), \(x, nn) dchisq(x, df = nn-1)))

ggplot(data = chi_sq_sample, aes(x = scaled_S2)) +
  geom_histogram(aes(y = after_stat(density)), fill = 'goldenrod', bins = 50, alpha = 0.6) +
  geom_line(aes(y = f_chi), col = 'cornflowerblue') +
  facet_grid(n ~ sigma, labeller = label_both, scales = 'free_y') +
  theme_classic(base_size = 16) +
  labs(
    x = expression((n-1)*S^2 / sigma^2),
    y = "Density",
    title = "Simulated and Analytic Densities of ((n-1)S^2)/σ^2"
  )

summary_table <- chi_sq_sample %>%
  group_by(n, sigma) %>%
  summarise(
    mean_sim = mean(S2),                
    var_sim  = var(S2),                  
    mean_analytic = sigma^2,             
    var_analytic  = 2*sigma^4 / (n-1)
  )

Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.

`summarise()` has grouped output by 'n', 'sigma'. You can override using the
`.groups` argument.

summary_table

# A tibble: 90,000 × 6
# Groups:   n, sigma [9]
       n sigma mean_sim var_sim mean_analytic var_analytic
   <dbl> <dbl>    <dbl>   <dbl>         <dbl>        <dbl>
 1     4     1    1.000   0.681             1        0.667
 2     4     1    1.000   0.681             1        0.667
 3     4     1    1.000   0.681             1        0.667
 4     4     1    1.000   0.681             1        0.667
 5     4     1    1.000   0.681             1        0.667
 6     4     1    1.000   0.681             1        0.667
 7     4     1    1.000   0.681             1        0.667
 8     4     1    1.000   0.681             1        0.667
 9     4     1    1.000   0.681             1        0.667
10     4     1    1.000   0.681             1        0.667
# ℹ 89,990 more rows

6 - Pick a single-parameter parent population (exponential, Poisson, Bernoulli, geometric). Simulate 1,000 replications across a parameter grid involving n∈{10,20,30} and the parameter of that population. Create scatterplots of S^2 vs Y ̅ for each combination in your grid. Verify that for these non-normal parent populations that S^2 and Y ̅ are not independent.

library(tidyverse)
library(purrrfect)


N <- 1000

many_exp_samples <- parameters(~n, ~lambda,
                               c(10, 20, 30), c(0.5, 1, 2)     
) %>%
  add_trials(N) %>%
  mutate(
    ysample = pmap(list(n, lambda), \(nn, lam) rexp(nn, lam)),  
    ybar = map_dbl(ysample, mean),                        
    S2 = map_dbl(ysample, var)                                 
  )

library(ggh4x)
ggplot(data = many_exp_samples) + 
  geom_point(aes(x = ybar, y = S2),
             shape='.')+ 
  labs(x = expression(bar(Y)),
       y = expression(S^2),
       title='Plots of sample mean vs sample variance')+
  facet_nested(lambda~n, labeller = label_both, scale = 'free_y') + 
  theme_classic()