This is a small example showing the power of SIMEX (simulation-extrapolation), via the simex package, to adjust for measurement error in OLS models.
library(pacman)
p_load(kirkegaard, simex, rms)
options(digits = 3)
We simulate data with varying amounts of measurement error added to the true variables, and check whether we can recover the right coefficients when we plug the known amount of measurement error into SIMEX.
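To see why a correction is needed at all, here is a minimal sketch (assuming classical, i.e. additive and independent, measurement error; the variable names are just for illustration) of the attenuation bias: with a single error-laden predictor, the OLS slope shrinks by the reliability of that predictor.
#attenuation sketch: one predictor, true slope 1, error SD .7
set.seed(42)
x = rnorm(1e5)
x_obs = x + rnorm(1e5) * .7   #observed score = true score + independent error
y = x                         #true slope is exactly 1
coef(lm(y ~ x_obs))["x_obs"]  #comes out near 0.67, not 1
1 / (1 + .7^2)                #theoretical attenuation factor (= reliability of x_obs)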
We fit three models each time: the true model on the error-free predictors, the naive model on the error-laden predictors, and a SIMEX-corrected version of the naive model. First, a small example with known error SDs of .7 and .5:
#example
sim1_data = tibble(
a = rnorm(1000),
b = rnorm(1000),
ax = a + rnorm(1000) * .7,
bx = b + rnorm(1000) * .5,
y = a + b
)
#fits
(sim1_true_fit = lm(y ~ a + b, data = sim1_data))
##
## Call:
## lm(formula = y ~ a + b, data = sim1_data)
##
## Coefficients:
## (Intercept) a b
## 7.02e-18 1.00e+00 1.00e+00
(sim1_naive_fit = lm(y ~ ax + bx, data = sim1_data, x = T))
##
## Call:
## lm(formula = y ~ ax + bx, data = sim1_data, x = T)
##
## Coefficients:
## (Intercept) ax bx
## -0.0227 0.6249 0.8090
(sim1_simex_fit = simex(
  sim1_naive_fit,
  SIMEXvariable = c("ax", "bx"),
  measurement.error = cbind(.7, .5),
  lambda = seq(.1, 2, by = .1)
))
##
## Naive model:
## lm(formula = y ~ ax + bx, data = sim1_data, x = T)
##
## SIMEX-Variables: ax, bx
## Number of Simulations: 100
##
## Coefficients:
## (Intercept) ax bx
## -0.014 0.832 0.985
#plot
plot(sim1_simex_fit)
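What simex() does can be sketched by hand for a single coefficient: for each lambda it adds extra simulated error with variance lambda times the known error variance, refits the naive model, and then extrapolates the coefficient back to lambda = -1, i.e. zero measurement error. The snippet below is a rough hand-rolled illustration of that idea using quadratic extrapolation (the package default); it only perturbs ax, whereas simex() perturbs all SIMEX variables jointly, so the number will not exactly match the output above.
#hand-rolled SIMEX sketch for the ax coefficient (illustration only)
lambdas = seq(.1, 2, by = .1)
ax_by_lambda = sapply(lambdas, function(l) {
  mean(replicate(50, {
    d = sim1_data
    d$ax_l = d$ax + rnorm(nrow(d)) * .7 * sqrt(l)  #inflate the error variance by a factor of lambda
    coef(lm(y ~ ax_l + bx, data = d))["ax_l"]
  }))
})
extrap = lm(ax_by_lambda ~ lambdas + I(lambdas^2))   #quadratic extrapolant
predict(extrap, newdata = data.frame(lambdas = -1))  #extrapolate to zero measurement error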
Now the main simulation. Settings:
#parameters
#note: the n range is passed to do_sim() below, but the sample size is currently fixed at 2500 inside it
sim2_n_lower = 100
sim2_n_higher = 10e3
sim2_ay = 1
sim2_by = .5
sim2_cy = .25
sim2_pred_cor_lower = .3
sim2_pred_cor_higher = .7
#pars to simulate
set.seed(1)
sim2_pars = expand_grid(
  ay = sim2_ay,
  by = sim2_by,
  cy = sim2_cy,
  rel_a = seq(.5, 1, .1),
  rel_b = seq(.5, 1, .1),
  rel_c = seq(.5, 1, .1)
)
#function for running
do_sim = function(ay, by, cy, rel_a, rel_b, rel_c, n_lower, n_higher, pred_cor_lower, pred_cor_higher) {
  # browser()
  #sample predictor intercorrelations
  pred_cors = runif(3, min = pred_cor_lower, max = pred_cor_higher)

  #make full correlation matrix and smooth it to ensure it is positive definite
  cor_matrix = MAT_vector2full(pred_cors, diag_value = 1) %>%
    psych::cor.smooth()
  # cor_matrix = matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow = 3)

  #sample size
  # n_size = runif(1, min = n_lower, max = n_higher)
  n_size = 2500

  #sample cases for the true predictors
  dd = MASS::mvrnorm(n = n_size,
                     mu = rep(0, 3),
                     Sigma = cor_matrix
                     # tol = 1e-4
  ) %>%
    set_colnames(letters[1:3]) %>%
    as_tibble()

  #SDs of the measurement errors, derived from the reliabilities
  errors_sd = cbind(
    sqrt(1 - rel_a),
    sqrt(1 - rel_b),
    sqrt(1 - rel_c)
  )

  #score y and construct the error-laden observed predictors
  dd %<>% mutate(
    #true y
    y = a * ay + b * by + c * cy,
    #observed predictors
    ax = a + rnorm(nrow(dd)) * errors_sd[1],
    bx = b + rnorm(nrow(dd)) * errors_sd[2],
    cx = c + rnorm(nrow(dd)) * errors_sd[3]
  )

  #fit true model
  true_fit = lm(y ~ a + b + c, data = dd, x = T, y = T)

  #fit model naively on the observed predictors
  naive_fit = lm(y ~ ax + bx + cx, data = dd, x = T, y = T)

  #SIMEX fit
  simex_fit = simex(
    naive_fit,
    SIMEXvariable = c("ax", "bx", "cx"),
    measurement.error = errors_sd
  )

  #return
  list(
    true_fit = true_fit,
    naive_fit = naive_fit,
    simex_fit = simex_fit,
    data = dd
  )
}
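As a quick sanity check, we can run do_sim() once by hand before the full grid (a hypothetical single run with reliabilities of .7; not part of the cached results below) and compare the three sets of coefficients: the true fit recovers 1, .5, .25 exactly, while the naive and SIMEX fits show the bias and its correction, respectively.
#single illustrative run (not cached; exact values depend on the random draw)
test_run = do_sim(
  ay = 1, by = .5, cy = .25,
  rel_a = .7, rel_b = .7, rel_c = .7,
  n_lower = 100, n_higher = 10e3,
  pred_cor_lower = .3, pred_cor_higher = .7
)
coef(test_run$true_fit)
coef(test_run$naive_fit)
coef(test_run$simex_fit)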
#run simulations
sim2_results = cache_object(
  {map(seq_along_rows(sim2_pars), function(i) {
    message(str_glue("{i} of {nrow(sim2_pars)}"))

    #fit
    do_sim(
      ay = sim2_pars$ay[i],
      by = sim2_pars$by[i],
      cy = sim2_pars$cy[i],
      rel_a = sim2_pars$rel_a[i],
      rel_b = sim2_pars$rel_b[i],
      rel_c = sim2_pars$rel_c[i],
      n_lower = sim2_n_lower,
      n_higher = sim2_n_higher,
      pred_cor_lower = sim2_pred_cor_lower,
      pred_cor_higher = sim2_pred_cor_higher
    )
  })},
  filename = "sim2_results.rds"
)
## Cache found, reading object from disk
#plot results
#pull out the coefs from SIMEX and naive
sim2_results %>%
  map_df(function(x) {
    tibble(
      #true
      a_true = coef(x$true_fit)[2],
      b_true = coef(x$true_fit)[3],
      c_true = coef(x$true_fit)[4],
      #naive
      a_naive = coef(x$naive_fit)[2],
      b_naive = coef(x$naive_fit)[3],
      c_naive = coef(x$naive_fit)[4],
      #simex
      a_simex = coef(x$simex_fit)[2],
      b_simex = coef(x$simex_fit)[3],
      c_simex = coef(x$simex_fit)[4]
    )
  }) %>%
  #long form
  gather(key = model, value = beta, a_true:c_simex) %>%
  #separate into predictor and method
  separate(col = model, into = c("predictor", "method"), sep = "_") %>%
  ggplot(aes(predictor, beta, fill = method)) +
  geom_boxplot() +
  # geom_point(data = tibble(
  #   predictor = letters[c(1:3)],
  #   # method = rep(c("naive", "simex"), each = 3),
  #   beta = rep(c(1, .5, .25), 1)
  # ), shape = 0, size = 5, mapping = aes(predictor, beta, fill = NULL)) +
  theme_bw()
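As a numeric companion to the boxplot, the estimates can also be averaged per predictor and method (this small summary re-extracts the coefficients rather than reusing the pipeline above):
#mean estimated beta by predictor and method
map_df(sim2_results, function(x) {
  tibble(
    predictor = c("a", "b", "c"),
    true = coef(x$true_fit)[2:4],
    naive = coef(x$naive_fit)[2:4],
    simex = coef(x$simex_fit)[2:4]
  )
}) %>%
  group_by(predictor) %>%
  summarise(true = mean(true), naive = mean(naive), simex = mean(simex))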
Thus we see that the naive estimates are biased toward zero, while the SIMEX estimates recover the true values of 1, .5, and .25 much more closely.