#5
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.1.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'purrrfect'
The following objects are masked from 'package:base':
replicate, tabulate
(home_samples <- rmultinom (10000 , size= 50 , prob = c (.73 , .20 , .07 ))
%>% t ()
%>% data.frame ()
%>% setNames (c ('Y1' , 'Y2' , 'Y3' ))
) %>% head ()
Y1 Y2 Y3
1 37 8 5
2 38 7 5
3 40 7 3
4 38 9 3
5 41 9 0
6 33 11 6
home_samples %>%
count (Y1, Y2) %>%
ggplot (aes (x = Y1, y = Y2, size = n)) +
geom_point (alpha = 0.5 ) +
labs (
x = "Fires in Family Homes" ,
y = "Fires in Apartments" ,
size = "Count" ,
title = "Most Common Combinations of Fires"
) +
theme_minimal ()
The most common combination seems to be right in the 38-39 fires in family homes and 11-12 in apartments
cov (home_samples) %>% round (2 )
Y1 Y2 Y3
Y1 9.91 -7.26 -2.65
Y2 -7.26 7.91 -0.65
Y3 -2.65 -0.65 3.30
The Analytic Covariance yields -7.30 while the simulation got -7.29. Therefore it approximated the analytic covariance fairly well.
6
library (tidyverse)
library (purrrfect)
#parameters
mu_x <- 5
mu_y <- 5
sig_x <- c (0.5 , 1 )
sig_y <- c (0.5 , 1 )
rho <- c (- 0.8 , - 0.4 , 0 , 0.4 , 0.8 )
(bvnorm_sim <- expand.grid (sig_x = sig_x, sig_y = sig_y, rho = rho) %>%
add_trials (10000 ) %>%
mutate (z1 = rnorm (n ()),
z2 = rnorm (n ()),
x_std = z1,
y_std = rho * z1 + sqrt (1 - rho^ 2 ) * z2,
x = sig_x * x_std + mu_x,
y = sig_y * y_std + mu_y))
# A tibble: 200,000 × 10
sig_x sig_y rho .trial z1 z2 x_std y_std x y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.5 0.5 -0.8 1 1.86 -1.79 1.86 -2.56 5.93 3.72
2 0.5 0.5 -0.8 2 0.128 1.18 0.128 0.604 5.06 5.30
3 0.5 0.5 -0.8 3 -0.569 1.10 -0.569 1.12 4.72 5.56
4 0.5 0.5 -0.8 4 -0.0449 0.690 -0.0449 0.450 4.98 5.23
5 0.5 0.5 -0.8 5 -0.379 0.150 -0.379 0.393 4.81 5.20
6 0.5 0.5 -0.8 6 -1.42 0.568 -1.42 1.47 4.29 5.74
7 0.5 0.5 -0.8 7 0.144 1.31 0.144 0.673 5.07 5.34
8 0.5 0.5 -0.8 8 0.0428 -0.793 0.0428 -0.510 5.02 4.75
9 0.5 0.5 -0.8 9 -1.95 -0.183 -1.95 1.45 4.02 5.73
10 0.5 0.5 -0.8 10 0.264 2.21 0.264 1.12 5.13 5.56
# ℹ 199,990 more rows
# A tibble: 6 × 10
sig_x sig_y rho .trial z1 z2 x_std y_std x y
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.5 0.5 -0.8 1 1.86 -1.79 1.86 -2.56 5.93 3.72
2 0.5 0.5 -0.8 2 0.128 1.18 0.128 0.604 5.06 5.30
3 0.5 0.5 -0.8 3 -0.569 1.10 -0.569 1.12 4.72 5.56
4 0.5 0.5 -0.8 4 -0.0449 0.690 -0.0449 0.450 4.98 5.23
5 0.5 0.5 -0.8 5 -0.379 0.150 -0.379 0.393 4.81 5.20
6 0.5 0.5 -0.8 6 -1.42 0.568 -1.42 1.47 4.29 5.74
Warning: package 'ggh4x' was built under R version 4.5.2
ggplot (data = bvnorm_sim) +
geom_point (aes (x = x, y = y), alpha = 0.3 , size = .3 ) +
geom_point (aes (x = 5 , y = 5 ), col= 'red' ) +
facet_nested (rho~ sig_x+ sig_y, labeller = label_both) +
theme_bw ()
Warning in geom_point(aes(x = 5, y = 5), col = "red"): All aesthetics have length 1, but the data has 200000 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.
7
N <- 10000
Y <- rgamma (N, shape = 2 , rate = 1 ) # simulate Y
X <- runif (N, min = 0 , max = Y) # simulate X | Y
sim <- data.frame (X = X, Y = Y)
head (sim)
X Y
1 1.06730832 1.4415045
2 0.62167205 0.8101139
3 2.18439906 2.3651039
4 0.02172052 0.5788106
5 0.59801740 0.6952863
6 3.09291637 3.4937808
Scatterplot 7
ggplot (sim, aes (x = X, y = Y)) +
geom_point (alpha = 0.3 ) +
labs (x = "x" , y = "y" ,
title = "Scatterplot of 10,000 (X,Y) pairs" ) +
theme_minimal ()
superimpose means
ggplot (
data = sim,
aes (x = X, y = Y)
) +
geom_point (
shape = '.' ,
alpha = 0.5
) +
geom_smooth (
method = 'loess' ,
se = FALSE ,
aes (color = 'Simulated' ),
linewidth = 1
) +
geom_function (
fun = function (x) x + 1 ,
aes (color = 'Analytic' ),
linewidth = 1
) +
theme_classic () +
labs (
color = 'E[X | Y]'
)
`geom_smooth()` using formula = 'y ~ x'
Approximated E-J
p1 <- mean (sim$ X <= 1 & sim$ Y <= 2 )
p2 <- mean (sim$ X <= 1 & sim$ Y <= 0.5 )
# f
p_cond <- mean (sim$ Y <= 2 & sim$ X <= 1 ) / mean (sim$ X <= 1 )
# g
p_gamma <- pgamma (2 , shape = 2 , rate = 1 )
# h
mean_X <- mean (sim$ X)
mean_Y <- mean (sim$ Y)
# i
cov_XY <- cov (sim$ X, sim$ Y)
cor_XY <- cor (sim$ X, sim$ Y)
# j
Y_minus_X <- sim$ Y - sim$ X
mean_Y_minus_X <- mean (Y_minus_X)
var_Y_minus_X <- var (Y_minus_X)
# k
lm_fit <- lm (X ~ Y, data = sim)
coefficients <- summary (lm_fit)$ coefficients
p1
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.03174599 0.012289347 2.583212 0.009802582
Y 0.47834225 0.005001009 95.649140 0.000000000