Performance of counts models under misclassification scenarios

Create simulated data

create_simulated_data <- function(
    n_iterations, n_obs_per_iteration, 
    x_range, b0, b1, population_size, SE, SP, 
    seed_default = 0) {
  
  if (0) {
    n_iterations <- 10
    n_obs_per_iteration <- 50
    b0 <- -3
    b1 <- -0.15
    x_range <- c(0, 10)
    population_size <- 100
    seed_default <- 0
    SE <- 0.9
    SP <- 1.0
  }
  
  require(dplyr)
  source("functions.R")
  
  df_simtmp <- data.frame(row_id = 1:n_obs_per_iteration) %>%
    mutate(
      x1 = seq(from = x_range[1], to = x_range[2], length.out = nrow(.)),
      y_logit_true = b0 + b1*x1,
      p_true = plogis(y_logit_true), # convert from logit to linear space
      pop = population_size
    )
  
  for (i in 1:n_iterations) {
    
    if (0) {
      i <- 1
    }
    set.seed(seed_default + i)
    
    prob_vals <- df_simtmp$p_true
    mat_method2 <- matrix(data = as.character(NA), nrow = population_size, ncol = length(prob_vals))
    
    for (j in 1:length(prob_vals)) {
      if (0) {
        j <- 1
      }
      ct_tmp <- make_crosstable(
        mu = prob_vals[j], 
        sens_tmp = SE, 
        spec_tmp = SP
      )
      ct_samp <- sample_from_crosstable(n_samples = 1, pop_size = population_size, ct_tmp = ct_tmp)
      mat_method2[, j] <- ct_samp$df_vals1[[1]]$x1
      
    }
    
    mat_method3 <- matrix(mat_method2 %in% c("tp", "fp"), ncol = dim(mat_method2)[2])
    prevs1 <- colSums(mat_method3) / population_size
    df_simtmp[, paste0("p", i)] <- prevs1
    
  }
  
  return(df_simtmp)
}

Function for running models, given a simulated dataset

run_models <- function(simulated_data) {
  if (0) {
    
    simulated_data <- create_simulated_data(
      n_iterations = 10, n_obs_per_iteration = 50, b0 = qlogis(0.1), b1 = 0,
      x_range = c(0, 10), population_size = 100, seed_default = 0, SE = 0.9, SP = 1.0
    )
    
  }
  
  y_vars <- names(simulated_data)[names(simulated_data) %in% paste0("p", 1:1000)]
    
  results1 <- sapply(y_vars, function(var) {
    if (0) {
      var <- y_vars[1]
    }
    
    df_iter <- simulated_data[, c("x1", var, "pop")]
    names(df_iter) <- c("x1", "pvar_tmp", "pop")
    df_iter$count <- df_iter$pvar_tmp * df_iter$pop
    
    df_pred <- data.frame(x1 = 0, pop = 1)
    
    ymat <- df_iter %>%
      mutate(
        noncount = pop - count ) %>%
      .[, c("count", "noncount")]
      
    #####
    # fit binomial
    fit_binomial <- glm(
      formula = cbind(ymat$count, ymat$noncount) ~ 1,
      data = df_iter,
      family = binomial()
    )
    rd_binomial <- deviance(fit_binomial)
    aic_binomial <- AIC(fit_binomial)
    (b0pred_binomial <- predict(fit_binomial, newdata = df_pred, type = "response"))
    
    #####
    # fit poisson
    fit_poisson <- glm(
      # formula = count ~ x1 + offset(log(pop)), 
      formula = count ~ 1 + offset(log(pop)), 
      data = df_iter,
      family = poisson()
    )
    rd_poisson <- deviance(fit_poisson)
    aic_poisson <- AIC(fit_poisson)
    (b0pred_poisson <- predict(fit_poisson, newdata = df_pred, type = "response"))
    
    
    #####
    # fit quasipoisson
    # -- formula for extracting log-likelihood from a quasi model is from here (page 2):
    #    https://cran.r-project.org/web/packages/bbmle/vignettes/quasi.pdf
    fit_qpois <- glm(
      formula = count ~ 1 + offset(log(pop)), 
      family = quasipoisson(), data = df_iter)
    # deviance(fit_qpois)
    # (deviance(fit_qpois)-deviance(fit_qpois))
    
    # If we square the Deviance Residuals and add them all up, we get the Residual Deviance statistic
    # https://data.library.virginia.edu/understanding-deviance-residuals/
    rd_qpois <- sum(residuals(fit_qpois, type = "deviance")^2)
    require(MuMIn)
    
    # adapted from: 
    # https://search.r-project.org/CRAN/refmans/MuMIn/html/QAIC.html
    hacked.quasipoisson <- function(...) {
      res <- quasipoisson(...)
      res$aic <- poisson(...)$aic
      res
    }
    chat_tmp <- deviance(fit_poisson) / fit_poisson$df.residual
    # chat_tmp <- deviance(fit_qpois) / fit_qpois$df.residual
    aic_qpois <- QAIC(update(fit_poisson, family = hacked.quasipoisson), chat = chat_tmp)
    
    (b0pred_qpois <- predict(fit_qpois, newdata = df_pred, type = "response"))
    
    # https://cran.r-project.org/web/packages/bbmle/vignettes/quasi.pdf
    # Comparing Poisson and quasi-Poisson...
    # "The deviance (deviance(glmOT.D93)=5.129 is not the same as −2L (-2*logLik(glmOT.D93)=46.761),
    # but the calculated differences in deviance are consistent, and are also extractable
    # from the quasi- fit even though the log-likelihood is NA"
    
    #####
    # fit beta-binomial
    require(aod)
    
    fit_bbinom <- aod::betabin(
      # formula = cbind(ymat$count, ymat$noncount) ~ x1,
      formula = cbind(ymat$count, ymat$noncount) ~ 1,
      random = ~0,
      # random = ~1,
      data = df_iter
    )
    rd_bbinom <- deviance(fit_bbinom)
    tmp <- summary(fit_bbinom)
    aic_bbinom <- (2*tmp@object@nbpar) - (2*tmp@object@logL)
    
    (b0pred_bbinom <- predict(fit_bbinom, newdata = df_pred))
    
    #####
    # fit tobit

    require(AER)
    fit_tobit2 <- AER::tobit(
      # formula = pvar_tmp ~ x1, 
      formula = pvar_tmp ~ 1, 
      left = 0, right = Inf, data = df_iter)
    loglik_tobit <- logLik(fit_tobit2)
    rd_tobit <- sum(residuals(fit_tobit2, type = "deviance")^2)
    aic_tobit <- -1 * AIC(fit_tobit2) # maybe needs to be flipped for comparability with the others
    # aic_tobit <- AIC(fit_tobit2) # maybe needs to be flipped for comparability with the others
    (b0pred_tobit <- predict(fit_tobit2, newdata = df_pred))
    
    # adapted from: https://stats.stackexchange.com/questions/149091/censored-regression-in-r
    mu <- unique(fitted(fit_tobit2))
    sigma <- fit_tobit2$scale
    p0 <- pnorm(mu/sigma)
    lambda <- function(x) dnorm(x)/pnorm(x)
    ey0 <- mu + sigma * lambda(mu/sigma)
    ey <- p0 * ey0

    
    
    out <- list(
      rd_binomial=rd_binomial, aic_binomial=aic_binomial, b0pred_binomial=b0pred_binomial,
      rd_poisson=rd_poisson, aic_poisson=aic_poisson, b0pred_poisson=b0pred_poisson,
      rd_qpois=rd_qpois, aic_qpois=aic_qpois, b0pred_qpois=b0pred_qpois,
      rd_bbinom=rd_bbinom, aic_bbinom=aic_bbinom, b0pred_bbinom=b0pred_bbinom,
      rd_tobit=rd_tobit, aic_tobit=aic_tobit, b0pred_tobit=b0pred_tobit,
      b0pred_tobit2=ey
    )
    
    return(out)
    
  }, simplify = FALSE)
  
  return(results1)
}

Run scenarios

b0_true <- qlogis(0.02)
# b0_true <- qlogis(0.15)

scenarios1 <- sapply(as.character(c(1.0, 0.9, 0.8, 0.7)), function(x) {
  if (0) {
    x <- 0.9
  }
  cat(x, "\n")
  simdata_tmp <- create_simulated_data(
    n_iterations = 50, n_obs_per_iteration = 100, b0 = b0_true, b1 = 0,
    x_range = c(0, 10), population_size = 100, seed_default = 0, SE = as.numeric(x), SP = 1.0
  )
  results_tmp <- run_models(simulated_data = simdata_tmp)
  
  if (0) {
    str(results_tmp$p1)
  }
  
  out <- list(
    results_tmp=results_tmp,
    simdata_tmp=simdata_tmp
  )
  return(out)
  
}, simplify = FALSE)

## 1 
## 0.9 
## 0.8 
## 0.7

Compile results

df_final <- bind_rows(lapply(names(scenarios1), function(x) {
  if (0) {
    x <- names(scenarios1)[1]
  }
  
  require(tidyr)
  require(stringr)
  
  dat_tmp <- bind_rows(scenarios1[[x]]$results_tmp) %>%
    mutate(
      iter = 1:nrow(.),
      scenario = x ) %>%
    pivot_longer(!iter & !scenario) %>%
    separate(name, into = c("metric", "distribution"), sep = "_")
    
  
}))

df_alldata <- bind_rows(lapply(names(scenarios1), function(x) {
  if (0) {
    x <- names(scenarios1)[1]
  }
  
  dat_tmp1 <- bind_rows(scenarios1[[x]]$simdata_tmp)
  dat_tmp2 <- dat_tmp1[names(dat_tmp1)[names(dat_tmp1) %in% paste0("p", 1:1000)]] %>%
    mutate(
      scenario = x ) %>%
    pivot_longer(!scenario) %>%
    mutate(iter = as.numeric(gsub("p", "", name))) %>%
    select(-name)
}))

Plots

require(ggplot2)
require(ggExtra)

ggplot2::ggplot(data = df_alldata) +
  geom_histogram(mapping = aes(x = value)) +
  ggtitle("Data distribution; all iterations pooled") +
  facet_wrap(~scenario, ncol = 1) +
  geom_vline(xintercept = plogis(b0_true))

ggplot2::ggplot(data = df_final %>% filter(metric == "b0pred")) +
  geom_violin(mapping = aes(x = distribution, y = value)) +
  ggtitle("Prediction of b0") +
  facet_wrap(~scenario) +
  geom_hline(yintercept = plogis(b0_true))

ggplot2::ggplot(data = df_final %>% filter(metric == "aic")) +
  geom_violin(mapping = aes(x = distribution, y = value)) +
  facet_wrap(~scenario) + 
  ggtitle("AIC")

ggplot2::ggplot(data = df_final %>% filter(metric == "rd")) +
  geom_violin(mapping = aes(x = distribution, y = value)) +
  facet_wrap(~scenario) +
  ggtitle("Residual deviance")

Performance of counts models under misclassification scenarios

Reed Sorensen

2022-11-28

Create simulated data

Function for running models, given a simulated dataset

Run scenarios

Compile results

Plots