5.7/5.8

#5

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(purrrfect)


Attaching package: 'purrrfect'

The following objects are masked from 'package:base':

    replicate, tabulate

(home_samples <- rmultinom(10000, size=50, prob = c(.73, .20, .07))
 %>% t()
 %>% data.frame()
 %>% setNames(c('Y1', 'Y2', 'Y3'))
 ) %>% head()

home_samples %>%
  count(Y1, Y2) %>%
  ggplot(aes(x = Y1, y = Y2, size = n)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Fires in Family Homes",
    y = "Fires in Apartments",
    size = "Count",
    title = "Most Common Combinations of Fires"
  ) +
  theme_minimal()

The most common combination seems to be right in the 38-39 fires in family homes and 11-12 in apartments

cov(home_samples) %>% round(2)

      Y1    Y2    Y3
Y1  9.91 -7.26 -2.65
Y2 -7.26  7.91 -0.65
Y3 -2.65 -0.65  3.30

The Analytic Covariance yields -7.30 while the simulation got -7.29. Therefore it approximated the analytic covariance fairly well.

6

library(tidyverse)
library(purrrfect)  
#parameters
mu_x <- 5
mu_y <- 5
sig_x <- c(0.5, 1)
sig_y <- c(0.5, 1)
rho   <- c(-0.8, -0.4, 0, 0.4, 0.8)

(bvnorm_sim <- expand.grid(sig_x = sig_x, sig_y = sig_y, rho = rho) %>%
   add_trials(10000) %>%
   mutate(z1    = rnorm(n()),
          z2    = rnorm(n()),
          x_std = z1,
          y_std = rho * z1 + sqrt(1 - rho^2) * z2,
          x     = sig_x * x_std + mu_x,
          y     = sig_y * y_std + mu_y))

# A tibble: 200,000 × 10
   sig_x sig_y   rho .trial      z1     z2   x_std  y_std     x     y
   <dbl> <dbl> <dbl>  <dbl>   <dbl>  <dbl>   <dbl>  <dbl> <dbl> <dbl>
 1   0.5   0.5  -0.8      1  1.86   -1.79   1.86   -2.56   5.93  3.72
 2   0.5   0.5  -0.8      2  0.128   1.18   0.128   0.604  5.06  5.30
 3   0.5   0.5  -0.8      3 -0.569   1.10  -0.569   1.12   4.72  5.56
 4   0.5   0.5  -0.8      4 -0.0449  0.690 -0.0449  0.450  4.98  5.23
 5   0.5   0.5  -0.8      5 -0.379   0.150 -0.379   0.393  4.81  5.20
 6   0.5   0.5  -0.8      6 -1.42    0.568 -1.42    1.47   4.29  5.74
 7   0.5   0.5  -0.8      7  0.144   1.31   0.144   0.673  5.07  5.34
 8   0.5   0.5  -0.8      8  0.0428 -0.793  0.0428 -0.510  5.02  4.75
 9   0.5   0.5  -0.8      9 -1.95   -0.183 -1.95    1.45   4.02  5.73
10   0.5   0.5  -0.8     10  0.264   2.21   0.264   1.12   5.13  5.56
# ℹ 199,990 more rows

head(bvnorm_sim)

# A tibble: 6 × 10
  sig_x sig_y   rho .trial      z1     z2   x_std  y_std     x     y
  <dbl> <dbl> <dbl>  <dbl>   <dbl>  <dbl>   <dbl>  <dbl> <dbl> <dbl>
1   0.5   0.5  -0.8      1  1.86   -1.79   1.86   -2.56   5.93  3.72
2   0.5   0.5  -0.8      2  0.128   1.18   0.128   0.604  5.06  5.30
3   0.5   0.5  -0.8      3 -0.569   1.10  -0.569   1.12   4.72  5.56
4   0.5   0.5  -0.8      4 -0.0449  0.690 -0.0449  0.450  4.98  5.23
5   0.5   0.5  -0.8      5 -0.379   0.150 -0.379   0.393  4.81  5.20
6   0.5   0.5  -0.8      6 -1.42    0.568 -1.42    1.47   4.29  5.74

library(ggh4x)

Warning: package 'ggh4x' was built under R version 4.5.2

ggplot(data = bvnorm_sim) + 
  geom_point(aes(x = x, y = y), alpha = 0.3, size = .3) + 
  geom_point(aes(x = 5, y = 5), col='red') + 
  facet_nested(rho~sig_x+sig_y, labeller = label_both) + 
  theme_bw()

Warning in geom_point(aes(x = 5, y = 5), col = "red"): All aesthetics have length 1, but the data has 200000 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.

7

N <- 10000

Y <- rgamma(N, shape = 2, rate = 1)  # simulate Y
X <- runif(N, min = 0, max = Y)      # simulate X | Y

sim <- data.frame(X = X, Y = Y)      
head(sim)

           X         Y
1 1.06730832 1.4415045
2 0.62167205 0.8101139
3 2.18439906 2.3651039
4 0.02172052 0.5788106
5 0.59801740 0.6952863
6 3.09291637 3.4937808

Scatterplot 7

ggplot(sim, aes(x = X, y = Y)) +
  geom_point(alpha = 0.3) +
  labs(x = "x", y = "y", 
       title = "Scatterplot of 10,000 (X,Y) pairs") +
  theme_minimal()

superimpose means

ggplot(
  data = sim,
  aes(x = X, y = Y)
) +
  geom_point(
    shape = '.',
    alpha = 0.5
  ) +
  geom_smooth(
    method = 'loess',
    se = FALSE,
    aes(color = 'Simulated'),
    linewidth = 1
  ) +
geom_function(
  fun = function(x) x + 1,
  aes(color = 'Analytic'),
  linewidth = 1
) +
  theme_classic() +
  labs(
    color = 'E[X | Y]'
  )

`geom_smooth()` using formula = 'y ~ x'

Approximated E-J

p1 <- mean(sim$X <= 1 & sim$Y <= 2)
p2 <- mean(sim$X <= 1 & sim$Y <= 0.5)

# f
p_cond <- mean(sim$Y <= 2 & sim$X <= 1) / mean(sim$X <= 1)

# g
p_gamma <- pgamma(2, shape = 2, rate = 1)

# h
mean_X <- mean(sim$X)
mean_Y <- mean(sim$Y)

# i
cov_XY <- cov(sim$X, sim$Y)
cor_XY <- cor(sim$X, sim$Y)

# j
Y_minus_X <- sim$Y - sim$X
mean_Y_minus_X <- mean(Y_minus_X)
var_Y_minus_X <- var(Y_minus_X)

# k
lm_fit <- lm(X ~ Y, data = sim)
coefficients <- summary(lm_fit)$coefficients

p1

[1] 0.4971

p2

[1] 0.0872

p_cond

[1] 0.7816038

p_gamma

[1] 0.5939942

mean_X

[1] 0.9886817

mean_Y

[1] 2.000525

cov_XY

[1] 0.9742808

cor_XY

[1] 0.6912473

mean_Y_minus_X

[1] 1.011843

var_Y_minus_X

[1] 1.063564

coefficients

              Estimate  Std. Error   t value    Pr(>|t|)
(Intercept) 0.03174599 0.012289347  2.583212 0.009802582
Y           0.47834225 0.005001009 95.649140 0.000000000