Simulations

Simulate models with varying measurement errors added to the true variables, and see if we can get the right results back if we plug in the amount of measurement error.

We fit:

True model, for benchmarking, based on the unobserved predictors.
Naive model, observed variables, ignoring measurement reliability issues.
SIMEX model, with reliabilities entered. Should get us back (1).

Simulation 1: Constant true slopes, constant reliability, 2 uncorrelated predictors

Settings:

Sample size is 1000.
Two predictors: slopes are both 1.
Predictors are uncorrelated.
Reliability is set at .7 and .5.
The true reliability is known and used in SIMEX.

#example
sim1_data = tibble(
  a = rnorm(1000),
  b = rnorm(1000),
  ax = a + rnorm(1000) * .7,
  bx = b + rnorm(1000) * .5,
  y = a + b
)

#fits
(sim1_true_fit = lm(y ~ a + b, data = sim1_data))

## 
## Call:
## lm(formula = y ~ a + b, data = sim1_data)
## 
## Coefficients:
## (Intercept)            a            b  
##    7.02e-18     1.00e+00     1.00e+00

(sim1_naive_fit = lm(y ~ ax + bx, data = sim1_data, x = T))

## 
## Call:
## lm(formula = y ~ ax + bx, data = sim1_data, x = T)
## 
## Coefficients:
## (Intercept)           ax           bx  
##     -0.0227       0.6249       0.8090

(sim1_simex_fit = simex(sim1_naive_fit, SIMEXvariable = c("ax", "bx"), measurement.error = cbind(.7, .5), lambda = seq(.1, 2, by = .1)))

## 
## Naive model:
## lm(formula = y ~ ax + bx, data = sim1_data, x = T)
## 
## SIMEX-Variables: ax, bx
## Number of Simulations: 100
## 
## Coefficients:
## (Intercept)           ax           bx  
##      -0.014        0.832        0.985

#plot
plot(sim1_simex_fit)

Simulation 2: Constant true slopes, varying reliability, 3 correlated predictors

Settings:

Sample size is 2500 constant.
Three predictors: slopes are 1, .5, .25, constant.
Correlations among predictors vary at random between .3 and .7, uniform distribution.
Reliability varies at random between .5 and 1, uniform distribution. They are uncorrelated between predictors.
The true reliability is known and used in SIMEX.

#parameters
# sim2_n_lower = 100
# sim2_n_higher = 10e3
sim2_ay = 1
sim2_by = .5
sim2_cy = .25
sim2_pred_cor_lower = .3
sim2_pred_cor_higher = .7

#pars to simulate
set.seed(1)
sim2_pars = expand_grid(
  ay = sim2_ay,
  by = sim2_by,
  cy = sim2_cy,
  rel_a = seq(.5, 1, .1),
  rel_b = seq(.5, 1, .1),
  rel_c = seq(.5, 1, .1)
)

#function for running
do_sim = function(ay, by, cy, rel_a, rel_b, rel_c, n_lower, n_higher, pred_cor_lower, pred_cor_higher) {
  # browser()
  #sample predictor correlations
  pred_cors = runif(3, min = pred_cor_lower, max = pred_cor_higher)
  
  #make matrix and smooth
  cor_matrix = MAT_vector2full(pred_cors, diag_value = 1) %>% 
    cor.smooth()
  # cor_matrix = matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow = 3)
  
  #sample size
  # n_size = runif(1, min = n_lower, max = n_higher)
  n_size = 2500
  
  #sample cases for predictors
  dd = MASS::mvrnorm(n = n_size,
                     mu = rep(0, 3),
                     Sigma = cor_matrix
                     # tol = 1e-4
                     ) %>% 
    set_colnames(letters[1:3]) %>% 
    as_tibble()
  
  #SD of errors
  errors_sd = cbind(
    sqrt(1 - rel_a),
    sqrt(1 - rel_b),
    sqrt(1 - rel_c)
    )
  
  #score y
  dd %<>% mutate(
    #true y
    y = a * ay + b * by + c * cy,
    
    #observed predictors
    ax = a + rnorm(nrow(dd)) * errors_sd[1],
    bx = b + rnorm(nrow(dd)) * errors_sd[2],
    cx = c + rnorm(nrow(dd)) * errors_sd[3]
  )
  
  #fit true
  true_fit = lm(y ~ a + b + c, data = dd, x = T, y = T)
  
  #fit model normally
  naive_fit = lm(y ~ ax + bx + cx, data = dd, x = T, y = T)
  
  #SIMEX fit
  simex_fit = simex(
    naive_fit,
    SIMEXvariable = letters[1:3] + "_o",
    measurement.error = errors_sd
    )
  
  #return
  list(
    true_fit = true_fit,
    naive_fit = naive_fit,
    simex_fit = simex_fit,
    data = dd
  )
}

#run simulations
sim2_results = cache_object(
  {map(seq_along_rows(sim2_pars), function(i) {
  message(str_glue("{i} of {nrow(sim2_pars)}"))
  
  #fit
  do_sim(
    ay = sim2_pars$ay[i],
    by = sim2_pars$by[i],
    cy = sim2_pars$cy[i],
    rel_a = sim2_pars$rel_a[i],
    rel_b = sim2_pars$rel_b[i],
    rel_c = sim2_pars$rel_c[i],
    n_lower = sim2_n_lower,
    n_higher = sim2_n_higher,
    pred_cor_lower = sim2_pred_cor_lower,
    pred_cor_higher = sim2_pred_cor_higher
  )
})},
filename = "sim2_results.rds"
)

## Cache found, reading object from disk

#plot results
#pull out the coefs from SIMEX and naive
sim2_results %>% 
  map_df(function(x) {
    tibble(
      #true
      a_true = coef(x$true_fit)[2],
      b_true = coef(x$true_fit)[3],
      c_true = coef(x$true_fit)[4],
      
      #naive
      a_naive = coef(x$naive_fit)[2],
      b_naive = coef(x$naive_fit)[3],
      c_naive = coef(x$naive_fit)[4],
      
      #simex
      a_simex = coef(x$simex_fit)[2],
      b_simex = coef(x$simex_fit)[3],
      c_simex = coef(x$simex_fit)[4]
    )
  }) %>% 
  #long form
  gather(key = model, value = beta, a_true:c_simex) %>% 
  #separate
  separate(col = model, into = c("predictor", "method"), sep = "_") %>% 
  ggplot(aes(predictor, beta, fill = method)) +
  geom_boxplot() +
  # geom_point(data = tibble(
  #   predictor = letters[c(1:3)],
  #   # method = rep(c("naive", "simex"), each = 3),
  #   beta = rep(c(1, .5, .25), 1)
  # ), shape = 0, size = 5, mapping = aes(predictor, beta, fill = NULL)) +
  theme_bw()

Thus we see:

The green boxes are closer to the blue, flat boxes than the red boxes.
The green boxes do not always have the mean at the mean of the blue boxes, that is, SIMEX does not always remove all the bias.
For predictor a, SIMEX under-estimates true slope, for predictor c, it overestimates it. But both times it is closer to the truth than ignoring the problem, as with the red boxes. Thus, SIMEX is preferable to naive approach.

SIMEX example: the simulation and extrapolation method to measurement error in models

About

Init

Simulations

Simulation 1: Constant true slopes, constant reliability, 2 uncorrelated predictors

Simulation 2: Constant true slopes, varying reliability, 3 correlated predictors