5.7 & 5.8: Multinomial, Bivariate Normal, and Simulations

Question 1

Part A.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(purrrfect)

Attaching package: 'purrrfect'

The following objects are masked from 'package:base':

    replicate, tabulate
library(GGally)

N <- 10000
(many_home_samples <- rmultinom(N, size = 50, prob = c(0.73,0.2, 0.07))
  %>% t()
  %>% data.frame()
  %>% setNames(c('Y1', 'Y2', 'Y3'))
) %>% head
  Y1 Y2 Y3
1 38  7  5
2 32 14  4
3 39  3  8
4 38  8  4
5 33 14  3
6 38  8  4
(ggpairs(many_home_samples,
         lower = list(continuous = wrap("points",
                                        alpha = 0.6,
                                        position = position_jitter(width = 0.2, height = 0.2)
                                        )
                      )
         )
  ) +theme_classic(base_size = 1)

(many_home_samples
  %>%count(Y1, Y2, sort = TRUE)
  %>% slice(1)
)
  Y1 Y2   n
1 38  9 322

The most common combination of (Y1, Y2) was the event where 37 of the sampled homes were family houses and 10 were apartments.

Part B.

sim_cov <- cov(many_home_samples$Y1, many_home_samples$Y2) %>% round(3)
analytic_cov <- -50*0.73*0.2

tibble(
  sim_cov = sim_cov,
  analytic_cov = analytic_cov
)
# A tibble: 1 × 2
  sim_cov analytic_cov
    <dbl>        <dbl>
1   -7.23         -7.3

The simulation study approximates the covariance very well, with the simulated and analytic covariances lying within approximately 0.1 (depending on the simulation) of each other.

Question 6

N <- 10000
(simstudy <- parameters(~sigx, ~sigy, ~rho,
                        c(0.5,1), c(0.5,1), c(-0.8, -0.4, 0, 0.4, 0.8)
                        )
  %>% add_trials(N)
  %>% mutate(z1 = rnorm(n()),
             z2 = rnorm(n()))
  %>% mutate(x_std = z1,
             y_std = rho*z1+sqrt(1-(rho^2))*z2)
  %>% mutate(x = sigx*x_std+5,
             y = sigy*y_std+5)
  ) %>% head
# A tibble: 6 × 10
   sigx  sigy   rho .trial      z1     z2   x_std   y_std     x     y
  <dbl> <dbl> <dbl>  <dbl>   <dbl>  <dbl>   <dbl>   <dbl> <dbl> <dbl>
1   0.5   0.5  -0.8      1  0.692   1.00   0.692   0.0462  5.35  5.02
2   0.5   0.5  -0.8      2 -0.525  -0.503 -0.525   0.118   4.74  5.06
3   0.5   0.5  -0.8      3 -1.28   -0.457 -1.28    0.753   4.36  5.38
4   0.5   0.5  -0.8      4 -0.0737 -1.35  -0.0737 -0.749   4.96  4.63
5   0.5   0.5  -0.8      5  0.214  -0.145  0.214  -0.258   5.11  4.87
6   0.5   0.5  -0.8      6  0.128  -0.418  0.128  -0.353   5.06  4.82
(ggplot(data = simstudy) 
  + geom_point(aes(x = x, y = y), alpha = 0.5, size = 0.5)
  + geom_point(aes(x = 5, y= 5), col = 'red')
  + facet_grid(rho~sigx+sigy, labeller = label_both)
  + theme_bw()
)
Warning in geom_point(aes(x = 5, y = 5), col = "red"): All aesthetics have length 1, but the data has 200000 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

Question 7

N <- 10000

(simstudy_1 <- data.frame(x = rexp(N,1))
  %>% mutate(y = x + rexp(N,1))
) %>% head
           x         y
1 0.42121470 0.6018134
2 2.13490780 2.3286536
3 0.05156102 0.8476295
4 0.55439325 1.5502714
5 0.53949357 0.5980236
6 0.23527863 1.2999947
(ggplot(data = simstudy_1, aes(x = x, y = y))
  + geom_point(size = 0.2, alpha = 0.5)
  + geom_smooth(method = 'loess', se = FALSE,
                aes(color = 'Simulated E(Y|X)'),
                linewidth = 0.7)
  + geom_function(fun = \(x) x+1,
                  aes(color = 'Analytic E(Y|X)'),
                  linewidth = 0.7)
  +labs(color = '')
  +theme_classic()
  )
`geom_smooth()` using formula = 'y ~ x'

Estimating Part E.

(simstudy_1
 %>% summarize(prob1 = mean(x<=1 & y<=2),
               prob2 = mean(x <=1 & y <= 0.5))
 )
   prob1 prob2
1 0.4901 0.086

Estimating Part F.

(simstudy_1
 %>% filter(x <= 1)
 %>% summarize(prob = mean(y <= 2))
 )
       prob
1 0.7834079

Estimating Part G.

(simstudy_1
 %>% summarize(prob = mean(y< x^2))
 )
    prob
1 0.2289

Estimating Part H.

(simstudy_1
 %>% summarise(E_Y = mean(y),
               E_X = mean(x))
)
       E_Y      E_X
1 2.012746 1.014543

Estimating Part I.

(simstudy_1
 %>% summarise('cov(x,y)' = cov(x,y),
               'cor(x,y)' = cor(x,y))
 )
  cov(x,y) cor(x,y)
1 1.036988 0.720212

Estimating Part J.

(simstudy_1
 %>% summarise('E(Y-X)' = mean(y-x),
               'Var(Y-X)' = var(y-x))
 )
     E(Y-X)  Var(Y-X)
1 0.9982022 0.9610471

Estimating Part K.

model <- lm(y ~ x, data = simstudy_1)
summary(model)

Call:
lm(formula = y ~ x, data = simstudy_1)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.9991 -0.7044 -0.2957  0.3950  7.6464 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.999416   0.013835   72.24   <2e-16 ***
x           0.998803   0.009622  103.80   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9804 on 9998 degrees of freedom
Multiple R-squared:  0.5187,    Adjusted R-squared:  0.5187 
F-statistic: 1.078e+04 on 1 and 9998 DF,  p-value: < 2.2e-16