5.7 & 5.8: Multinomial, Bivariate Normal, and Simulations

Question 1

Part A.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(purrrfect)

Attaching package: 'purrrfect'

The following objects are masked from 'package:base':

    replicate, tabulate
library(GGally)

N <- 10000
(many_home_samples <- rmultinom(N, size = 50, prob = c(0.73,0.2, 0.07))
  %>% t()
  %>% data.frame()
  %>% setNames(c('Y1', 'Y2', 'Y3'))
) %>% head
  Y1 Y2 Y3
1 38  9  3
2 39  8  3
3 39  9  2
4 36  9  5
5 35 12  3
6 38  9  3
(ggpairs(many_home_samples,
         lower = list(continuous = wrap("points",
                                        alpha = 0.6,
                                        position = position_jitter(width = 0.2, height = 0.2)
                                        )
                      )
         )
  ) +theme_classic(base_size = 1)

(many_home_samples
  %>%count(Y1, Y2, sort = TRUE)
  %>% slice(1)
)
  Y1 Y2   n
1 37 10 314

The most common combination of (Y1, Y2) was the event where 37 of the sampled homes were family houses and 10 were apartments.

Part B.

sim_cov <- cov(many_home_samples$Y1, many_home_samples$Y2) %>% round(3)
analytic_cov <- -50*0.73*0.2

tibble(
  sim_cov = sim_cov,
  analytic_cov = analytic_cov
)
# A tibble: 1 × 2
  sim_cov analytic_cov
    <dbl>        <dbl>
1   -7.21         -7.3

The simulation study approximates the covariance very well, with the simulated and analytic covariances lying within approximately 0.1 (depending on the simulation) of each other.

Question 6

N <- 10000
(simstudy <- parameters(~sigx, ~sigy, ~rho,
                        c(0.5,1), c(0.5,1), c(-0.8, -0.4, 0, 0.4, 0.8)
                        )
  %>% add_trials(N)
  %>% mutate(z1 = rnorm(n()),
             z2 = rnorm(n()))
  %>% mutate(x_std = z1,
             y_std = rho*z1+sqrt(1-(rho^2))*z2)
  %>% mutate(x = sigx*x_std+5,
             y = sigy*y_std+5)
  ) %>% head
# A tibble: 6 × 10
   sigx  sigy   rho .trial     z1     z2  x_std   y_std     x     y
  <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl> <dbl> <dbl>
1   0.5   0.5  -0.8      1 -0.307 -0.219 -0.307  0.114   4.85  5.06
2   0.5   0.5  -0.8      2 -0.926 -0.362 -0.926  0.524   4.54  5.26
3   0.5   0.5  -0.8      3  0.706 -0.551  0.706 -0.895   5.35  4.55
4   0.5   0.5  -0.8      4 -0.119 -1.43  -0.119 -0.763   4.94  4.62
5   0.5   0.5  -0.8      5 -0.337 -0.511 -0.337 -0.0372  4.83  4.98
6   0.5   0.5  -0.8      6  1.39  -0.747  1.39  -1.56    5.70  4.22
(ggplot(data = simstudy) 
  + geom_point(aes(x = x, y = y), alpha = 0.5, size = 0.5)
  + geom_point(aes(x = 5, y= 5), col = 'red')
  + facet_grid(rho~sigx+sigy, labeller = label_both)
  + theme_bw()
)
Warning in geom_point(aes(x = 5, y = 5), col = "red"): All aesthetics have length 1, but the data has 200000 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

Question 7

N <- 10000

(simstudy_1 <- data.frame(x = rexp(N,1))
  %>% mutate(y = x + rexp(N,1))
) %>% head
          x         y
1 0.1488812 0.3580922
2 0.8677757 1.1195775
3 0.2762563 1.9954143
4 0.8810833 2.2242157
5 0.5809241 1.2253704
6 0.1663885 1.8102942
(ggplot(data = simstudy_1, aes(x = x, y = y))
  + geom_point(size = 0.2, alpha = 0.5)
  + geom_smooth(method = 'loess', se = FALSE,
                aes(color = 'Simulated E(Y|X)'),
                linewidth = 0.7)
  + geom_function(fun = \(x) x+1,
                  aes(color = 'Analytic E(Y|X)'),
                  linewidth = 0.7)
  +labs(color = '')
  +theme_classic()
  )
`geom_smooth()` using formula = 'y ~ x'

Estimating Part E.

(simstudy_1
 %>% summarize(prob1 = mean(x<=1 & y<=2),
               prob2 = mean(x <=1 & y <= 0.5))
 )
   prob1  prob2
1 0.4872 0.0877

Estimating Part F.

(simstudy_1
 %>% filter(x <= 1)
 %>% summarize(prob = mean(y <= 2))
 )
       prob
1 0.7801441

Estimating Part G.

(simstudy_1
 %>% summarize(prob = mean(y< x^2))
 )
    prob
1 0.2339

Estimating Part H.

(simstudy_1
 %>% summarise(E_Y = mean(y),
               E_X = mean(x))
)
       E_Y      E_X
1 2.018515 1.015409

Estimating Part I.

(simstudy_1
 %>% summarise('cov(x,y)' = cov(x,y),
               'cor(x,y)' = cor(x,y))
 )
  cov(x,y)  cor(x,y)
1  1.02562 0.7149623

Estimating Part J.

(simstudy_1
 %>% summarise('E(Y-X)' = mean(y-x),
               'Var(Y-X)' = var(y-x))
 )
    E(Y-X)  Var(Y-X)
1 1.003106 0.9678443

Estimating Part K.

model <- lm(y ~ x, data = simstudy_1)
summary(model)

Call:
lm(formula = y ~ x, data = simstudy_1)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.0152 -0.7071 -0.3069  0.3832 12.9836 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1.016704   0.013884   73.23   <2e-16 ***
x           0.986608   0.009649  102.25   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9837 on 9998 degrees of freedom
Multiple R-squared:  0.5112,    Adjusted R-squared:  0.5111 
F-statistic: 1.045e+04 on 1 and 9998 DF,  p-value: < 2.2e-16