PS 5.7-5.8

  1. Consider Question 1, but now suppose 50 homes are sampled instead of 4. Simulate 10,000 replications of this sampling scheme. Visualize the joint distribution of Y_1 = number of fires in family homes and Y_2 = number of fires in apartments. Make sure to use appropriate methods for avoiding overplotting. Answer the following.
  1. Which (Y_1,Y_2) combination is most common?
  2. Simulate Cov(Y_1,Y_2). How well does it approximate the analytic covariance?
library(tidyverse)
library(GGally)
# 10,000 multinomial samples of 50 homes, using the Question 1 probabilities;
# Y1 = fires in family homes, Y2 = fires in apartments, Y3 = remaining category
(houses_samples <- rmultinom(10000, size = 50, prob = c(.73, .2, .07))
  %>% t()
  %>% data.frame()
  %>% setNames(c('Y1', 'Y2', 'Y3'))
) %>% head
  Y1 Y2 Y3
1 34 14  2
2 34 11  5
3 39  8  3
4 36 11  3
5 41  7  2
6 35 12  3
# Pairs plot of the three counts; jitter and transparency reduce overplotting
# of the discrete values
ggpairs(houses_samples,
        lower = list(continuous = wrap("points",
                                       alpha = 0.6,
                                       position = position_jitter(width = 0.2,
                                                                  height = 0.2)))) +
  theme_classic(base_size = 1)

houses_samples %>%
  count(Y1, Y2, sort = TRUE) %>%
  head()
  Y1 Y2   n
1 37 10 312
2 38  9 303
3 36 11 291
4 37 11 269
5 36 10 268
6 37  9 265

The most common (Y_1, Y_2) combination is (37, 10), which occurred 312 times in the 10,000 replications.

sim_cov <- cov(houses_samples$Y1, houses_samples$Y2)  # simulated covariance
analytic_cov <- -50 * 0.73 * 0.20                      # Cov(Y_1, Y_2) = -n * p_1 * p_2 = -7.3
sim_cov - analytic_cov
[1] 0.06252607

The simulated and analytic covariances are very similar, differing by only about 0.06.
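As a further check (a sketch, reusing the sampling probabilities from the simulation above), the simulated marginal variances can be compared with the analytic multinomial values Var(Y_i) = n p_i (1 - p_i).

n <- 50
p <- c(Y1 = 0.73, Y2 = 0.20, Y3 = 0.07)
rbind(simulated = sapply(houses_samples, var),  # sample variance of each count
      analytic  = n * p * (1 - p))              # multinomial Var(Y_i) = n * p_i * (1 - p_i)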

  1. Consider the problem of simulating bivariate normal data from Lecture 5.8. Using parameters() and add_trials() from the purrrfect package, and the relationships given on Slide 12, write the code that will produce the simulated data on Slide 13. This data set consists of 10,000 realizations of (X,Y) ∼ BVN(μ_X = 5, μ_Y = 5, σ_X^2, σ_Y^2, ρ) for each combination of σ_X ∈ {0.5, 1}, σ_Y ∈ {0.5, 1}, and ρ ∈ {-0.8, -0.4, 0, 0.4, 0.8}. Use this data set to reproduce the visualizations on Slide 14.
library(tidyverse)
library(purrrfect)


N <- 10000
# All 20 combinations of (sigma_X, sigma_Y, rho), each with N simulated trials
(simstudy <- parameters(~sigx, ~sigy, ~rho,
                        c(0.5, 1), c(0.5, 1), c(-0.8, -0.4, 0, 0.4, 0.8))
  %>% add_trials(N)
  %>% mutate(z1 = rnorm(n()),    # independent standard normals
             z2 = rnorm(n()),
             x_std = z1,         # Cor(x_std, y_std) = rho by construction
             y_std = rho * z1 + sqrt(1 - rho^2) * z2,
             x = sigx * x_std + 5,  # rescale and shift to the target SDs and means
             y = sigy * y_std + 5)
) %>% head(3)
# A tibble: 3 × 10
   sigx  sigy   rho .trial     z1      z2  x_std  y_std     x     y
  <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl> <dbl> <dbl>
1   0.5   0.5  -0.8      1  1.35   0.0123  1.35  -1.07   5.67  4.46
2   0.5   0.5  -0.8      2 -0.881 -0.259  -0.881  0.549  4.56  5.27
3   0.5   0.5  -0.8      3 -0.738  0.499  -0.738  0.889  4.63  5.44
ggplot(simstudy, aes(x = x, y = y)) +
  geom_point(alpha = 0.5, size = 0.5) +
  annotate("point", x = 5, y = 5, color = "red") +  # mark the common mean (5, 5)
  facet_grid(
    rho ~ sigx + sigy,
    labeller = label_both
  ) +
  theme_bw()
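As a quick numerical check (a sketch using the simstudy data built above), the sample correlation of (x, y) within each parameter combination should be close to the target rho.

(simstudy
  %>% group_by(sigx, sigy, rho)
  %>% summarize(sim_cor = cor(x, y), .groups = 'drop')
) %>% head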

  1. Consider Question 4. Simulate 10,000 realizations of (X,Y) pairs. Create a scatterplot of these (X,Y) pairs. Superimpose the analytic and simulated conditional mean functions. Then use the simulated data to approximate the answers to e)-j). Use lm() to estimate the regression coefficients β_0 and β_1 for part k).
library(tidyverse)

N <- 10000
# X ~ Exp(1); given X = x, Y = x + W with W ~ Exp(1)
(sim_study <- data.frame(x = rexp(N, rate = 1))
              %>% mutate(y = map_dbl(x, \(x) x + rexp(1, 1)))
)  %>% head
          x         y
1 0.8015773 2.0204804
2 0.7599702 1.3518701
3 1.8424854 1.9670407
4 0.2127960 0.8643048
5 1.9040161 2.4508210
6 0.7748084 2.1749495
ggplot(data = sim_study, aes(x = x, y = y)) + 
  geom_point(size = .2, alpha = .5)  + 
  theme_classic()

ggplot(data = sim_study, aes(x = x, y = y)) + 
  geom_point(size = .2, alpha = .5)  + 
  geom_smooth(method = 'loess', se = FALSE, 
              aes(color = 'Simulated E(Y|X)'),
              linewidth = .7) + 
  geom_function(fun = \(x) x + 1, 
                aes(color = 'Analytic E(Y|X)'),
                linewidth = .7) + 
  labs(color = '') + 
  theme_classic()
`geom_smooth()` using formula = 'y ~ x'
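For a rough numerical comparison (a sketch; the bin width of 0.5 is an arbitrary choice), binned sample means of y can be set against the analytic conditional mean E(Y|X) = x + 1.

(sim_study
  %>% mutate(x_bin = floor(x / 0.5) * 0.5 + 0.25)  # midpoint of each bin of width 0.5
  %>% group_by(x_bin)
  %>% summarize(mean_y = mean(y), n = n())
  %>% mutate(analytic = x_bin + 1)                 # E(Y | X = x) = x + 1 at the midpoint
) %>% head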

#e
(sim_study
    %>% summarize('P(X<=1, Y<=2)' = mean(x<=1 & y<=2),
                  'P(X<=1, Y<=.5)' = mean(x<=1 & y<=.5),
                  )
 )
  P(X<=1, Y<=2) P(X<=1, Y<=.5)
1         0.504         0.0929
#f
(sim_study
    %>% filter(x<=1)
    %>% summarize('P(Y<=2 | X<=1)' = mean(y<=2)
                  )
 )
  P(Y<=2 | X<=1)
1      0.7862715
#g
(sim_study
  %>% summarize('P(Y>= X^2)' = mean(y>=x^2)
                  )
)
  P(Y>= X^2)
1     0.7805
#h
(sim_study
  %>% summarize('E(y)' = mean(y),
                'E(x)' = mean(x)
                  )
)
      E(y)     E(x)
1 1.983346 0.980015
#i
sim_study %>% 
  cov
          x         y
x 0.9457641 0.9419994
y 0.9419994 1.9914105
sim_study %>%
  cor
          x         y
x 1.0000000 0.6864024
y 0.6864024 1.0000000
#j
(sim_study
  %>% summarize('E(y-x)' = mean(y-x),
                'Var(y-x)' = var(y-x)
                  )
)
    E(y-x) Var(y-x)
1 1.003331 1.053176
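As a sanity check (a sketch based only on the model as simulated above, X ~ Exp(1) and Y = X + W with W ~ Exp(1) independent), the analytic values E(Y - X) = E(W) = 1 and Var(Y - X) = Var(W) = 1 line up with part j), and the correlation can be compared the same way.

c(analytic  = 1 / sqrt(2),                    # Cor(X, Y) = Var(X) / sqrt(Var(X) * Var(Y)) = 1 / sqrt(2)
  simulated = cor(sim_study$x, sim_study$y))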
#k
fit <- sim_study %>%
  lm(y ~ x, data = .)

summary(fit)

Call:
lm(formula = y ~ x, data = .)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.0063 -0.7156 -0.3270  0.3832  9.4648 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.00723    0.01457   69.13   <2e-16 ***
x            0.99602    0.01055   94.38   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.026 on 9998 degrees of freedom
Multiple R-squared:  0.4711,    Adjusted R-squared:  0.4711 
F-statistic:  8907 on 1 and 9998 DF,  p-value: < 2.2e-16
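The fitted coefficients are close to β_0 = 1 and β_1 = 1, consistent with the analytic conditional mean E(Y|X) = x + 1 superimposed in the plot above.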