Problem

A key part of the CODEm pipeline involves smoothing residuals from a stage 1 model. For observations with mean equal to 0 (e.g. a death rate with zero deaths observed), common approaches to calculating residuals break down because 0 lies exactly on the boundary of the outcome space (i.e. both log(0) and logit(0) are negative infinity).
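
For example, in R:

log(0)    # -Inf
qlogis(0) # -Inf (qlogis is the logit transform)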

Background and goal

Chris mentioned that Brad Bell used a random effects approach to shrink residuals toward the mean estimate, effectively solving the issue of residuals for zero-valued observations. The goal is to explore the feasibility of this approach for our current methods development work.

Overview of the method

  1. Fit a stage 1 GLM with covariates

    • e.g. regmod with a binomial likelihood, using age, time, sex, and location as covariates
  2. Fit a stage 2 GLMM (mixed model) with random intercepts by individual observation, a fixed intercept, and one covariate: the in-sample point prediction from stage 1.

    • Potentially need to set a strict uniform prior on the slope of that covariate, constraining it to equal 1 (see the offset sketch after this list)
  3. Extract the random intercepts from the stage 2 model, which include random intercepts for the zero-valued observations

  4. Because shrinkage pulls the random intercepts toward the mean, multiply these pseudo-residuals by a scalar to return them to the scale of the residuals observed in the non-zero observations

  5. Smooth these residuals as usual in the ST step
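
For step 2, one way to impose that constraint without a prior is to move the stage 1 prediction into an offset, so its coefficient is fixed at exactly 1. A minimal lme4 sketch using the variable names from the simulation code below (an illustration, not part of the original analysis):

fit1_offset <- glmer(
  formula = as.matrix(df1[, c("y_binom_count", "y_binom_noncount")]) ~ 1 + offset(pred0) + (1 | row_id),
  data = df1,
  family = "binomial"
)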

Limitations

Observations

Simulated data

#
# handling_zeros_example1.R
#


library(lme4)   # glmer() for the stage 2 mixed model
library(dplyr)  # data manipulation

set.seed(123)

#####

make_binomial_data <- function(n_sims, true_dgp, samp_range) {

  if (0) {
    # debug values for interactive testing
    n_sims = 100
    true_dgp = function(x) {0.002 + x*0.0005}
    samp_range = c(1000, 2000)
  }

  # true probabilities along a covariate grid, with random sample sizes
  df_sim1 <- data.frame(x1 = seq(0, 10, length.out = n_sims)) %>%
    mutate(y_true = true_dgp(x = x1)) %>%
    mutate(
      samp_size = round(runif(n = n_sims, min = samp_range[1], max = samp_range[2])),
      row_id = 1:nrow(.)
    )

  # binomial draws; y_binom_mean is the observed proportion (can be exactly 0)
  df_sim2 <- df_sim1 %>%
    mutate(
      y_binom_count = rbinom(n = nrow(.), size = samp_size, prob = y_true),
      y_binom_noncount = samp_size - y_binom_count,
      y_binom_mean = y_binom_count / samp_size
    )

  # Poisson analogue: sum of samp_size draws, each with rate y_true
  df_sim2$y_poisson_count <- mapply(
    FUN = function(x, n) sum(rpois(n = n, lambda = x)),
    x = df_sim2$y_true,
    n = df_sim2$samp_size
  )
  df_sim2$y_poisson_mean <- df_sim2$y_poisson_count / df_sim2$samp_size

  return(df_sim2)
}

df1 <- make_binomial_data(
  n_sims = 250,
  # true_dgp = function(x) {0.002 + x*0.005},
  # true_dgp = function(x) {0.002 + x*-0.0001},
  true_dgp = function(x) {0.0007 + x*0.0002},
  samp_range = c(3000, 4000)
) %>%
  mutate(
    y_binom_mean_logit = qlogis(y_binom_mean)  # -Inf wherever y_binom_mean == 0
  )

## NOTE: consider using a different sample size for fitting than for generating
##       the data, so there is unexplained between-study heterogeneity


# stage 1: fixed-effects GLM with one covariate
fit0 <- glm(
  formula = as.matrix(df1[, c("y_binom_count", "y_binom_noncount")]) ~ 1 + x1,
  # formula = as.matrix(df1[, c("y_binom_count", "y_binom_noncount")]) ~ 1,
  data = df1,
  family = "binomial"
)

df1$pred0 <- predict(fit0, newdata = df1)  # linear predictor on the logit scale

with(df1, plot(x1, y_binom_mean))
with(df1, lines(x1, plogis(pred0)))


# stage 2: GLMM with an observation-level random intercept and the stage 1
# prediction as the only covariate
fit1 <- glmer(
  formula = as.matrix(df1[, c("y_binom_count", "y_binom_noncount")]) ~ 1 + pred0 + (1 | row_id),
  # formula = as.matrix(df1[, c("y_binom_count", "y_binom_noncount")]) ~ 1 + (1 | row_id),
  data = df1,
  family = "binomial"
)

tmp_summary <- summary(fit1)
tmp_summary
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: as.matrix(df1[, c("y_binom_count", "y_binom_noncount")]) ~ 1 +  
##     pred0 + (1 | row_id)
##    Data: df1
## 
##      AIC      BIC   logLik deviance df.resid 
##   1161.5   1172.0   -577.7   1155.5      247 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.4421 -0.6754 -0.1291  0.5394  3.5322 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  row_id (Intercept) 0.02507  0.1583  
## Number of obs: 250, groups:  row_id, 250
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.03442    0.51512   0.067    0.947    
## pred0        1.00737    0.08124  12.400   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr)
## pred0 0.998
df1$pred1 <- predict(fit1, re.form = NA) # predict without random effect
df1$pred2 <- predict(fit1, re.form = NULL) # predict with random effects


df1$ranefs_check <- ranef(fit1)[[1]][, 1]  # conditional modes of the random intercepts
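
As a quick sanity check (an addition, not in the original script): the extracted conditional modes should equal pred2 - pred1 up to numerical tolerance, given that df1 rows are already ordered by row_id:

all.equal(df1$ranefs_check, unname(df1$pred2 - df1$pred1))  # expect TRUE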

Multiply the pseudo-residuals by a scalar (e.g. 3)

df2 <- df1 %>%
  mutate(
    ranef = pred2 - pred1,        # pseudo-residual (the fitted random intercept)
    pred3 = pred1 + 3 * ranef,    # rescale to counteract shrinkage
    pred3_plogis = plogis(pred3)
  )
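
The scalar of 3 here is ad hoc. One possible heuristic (an assumption, not something from the original analysis) is to choose the scalar so the rescaled random intercepts match the spread of the raw logit-scale residuals among the non-zero observations:

nonzero <- df2$y_binom_mean > 0
raw_resid_logit <- qlogis(df2$y_binom_mean[nonzero]) - df2$pred1[nonzero]
scalar_hat <- sd(raw_resid_logit) / sd(df2$ranef)
scalar_hat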

table(df2$y_binom_mean == 0)  # how many observations are exactly zero?
## 
## FALSE  TRUE 
##   246     4
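To confirm the point of the exercise (a check I am adding, not in the original output), the zero-valued observations now carry finite pseudo-residuals and nonzero predictions:

df2 %>%
  filter(y_binom_mean == 0) %>%
  select(x1, y_binom_mean, ranef, pred3_plogis)
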
# with(df2, plot(x1, y_binom_mean, ylim = c(0, max(c(plogis(pred3), y_binom_mean)))))
with(df2, plot(x1, y_binom_mean))
with(df2, lines(x1, plogis(pred0)))

with(df2, points(x1, plogis(pred3), col = adjustcolor("red", alpha.f = 0.3), pch = 16))

# connect each observed mean to its rescaled prediction with a vertical segment
for (i in 1:nrow(df2)) {
  if (0) {
    i <- 1  # debug value for interactive testing
  }
  lines(
    x = rep(df2[i, "x1"], 2),
    y = df2[i, c("y_binom_mean", "pred3_plogis")],
    col = adjustcolor("darkblue", alpha.f = 0.4)
  )
}


# stand-in for the ST smoothing step: refit a smooth curve to the rescaled predictions
fit2 <- glm(pred3_plogis ~ x1, data = df2, family = "quasibinomial")
df2$pred4 <- predict(fit2)
with(df2, lines(x1, plogis(pred4), col = "red", lty = 2))

Note that, due to the distributional mismatch between the binomial likelihood and the normal random effects, a scalar that is too large will introduce bias

  • e.g. scalar = 10 in this example
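
A sketch for reproducing that comparison (the refit and plotting choices are my assumptions; the original scalar = 10 figure is not shown here):

df2b <- df1 %>%
  mutate(
    ranef = pred2 - pred1,
    pred3 = pred1 + 10 * ranef,  # scalar = 10 instead of 3
    pred3_plogis = plogis(pred3)
  )
fit2b <- glm(pred3_plogis ~ x1, data = df2b, family = "quasibinomial")
with(df2b, lines(x1, plogis(predict(fit2b)), col = "purple", lty = 3))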