Create simulated data
create_simulated_data <- function(
n_iterations, n_obs_per_iteration,
x_range, b0, b1, population_size, SE, SP,
seed_default = 0) {
if (0) {
n_iterations <- 10
n_obs_per_iteration <- 50
b0 <- -3
b1 <- -0.15
x_range <- c(0, 10)
population_size <- 100
seed_default <- 0
SE <- 0.9
SP <- 1.0
}
require(dplyr)
source("functions.R")
df_simtmp <- data.frame(row_id = 1:n_obs_per_iteration) %>%
mutate(
x1 = seq(from = x_range[1], to = x_range[2], length.out = nrow(.)),
y_logit_true = b0 + b1*x1,
p_true = plogis(y_logit_true), # convert from logit to linear space
pop = population_size
)
for (i in 1:n_iterations) {
if (0) {
i <- 1
}
set.seed(seed_default + i)
prob_vals <- df_simtmp$p_true
mat_method2 <- matrix(data = as.character(NA), nrow = population_size, ncol = length(prob_vals))
for (j in 1:length(prob_vals)) {
if (0) {
j <- 1
}
ct_tmp <- make_crosstable(
mu = prob_vals[j],
sens_tmp = SE,
spec_tmp = SP
)
ct_samp <- sample_from_crosstable(n_samples = 1, pop_size = population_size, ct_tmp = ct_tmp)
mat_method2[, j] <- ct_samp$df_vals1[[1]]$x1
}
mat_method3 <- matrix(mat_method2 %in% c("tp", "fp"), ncol = dim(mat_method2)[2])
prevs1 <- colSums(mat_method3) / population_size
df_simtmp[, paste0("p", i)] <- prevs1
}
return(df_simtmp)
}
Function for running models, given a simulated dataset
run_models <- function(simulated_data) {
if (0) {
simulated_data <- create_simulated_data(
n_iterations = 10, n_obs_per_iteration = 50, b0 = qlogis(0.1), b1 = 0,
x_range = c(0, 10), population_size = 100, seed_default = 0, SE = 0.9, SP = 1.0
)
}
y_vars <- names(simulated_data)[names(simulated_data) %in% paste0("p", 1:1000)]
results1 <- sapply(y_vars, function(var) {
if (0) {
var <- y_vars[1]
}
df_iter <- simulated_data[, c("x1", var, "pop")]
names(df_iter) <- c("x1", "pvar_tmp", "pop")
df_iter$count <- df_iter$pvar_tmp * df_iter$pop
df_pred <- data.frame(x1 = 0, pop = 1)
ymat <- df_iter %>%
mutate(
noncount = pop - count ) %>%
.[, c("count", "noncount")]
#####
# fit binomial
fit_binomial <- glm(
formula = cbind(ymat$count, ymat$noncount) ~ 1,
data = df_iter,
family = binomial()
)
rd_binomial <- deviance(fit_binomial)
aic_binomial <- AIC(fit_binomial)
(b0pred_binomial <- predict(fit_binomial, newdata = df_pred, type = "response"))
#####
# fit poisson
fit_poisson <- glm(
# formula = count ~ x1 + offset(log(pop)),
formula = count ~ 1 + offset(log(pop)),
data = df_iter,
family = poisson()
)
rd_poisson <- deviance(fit_poisson)
aic_poisson <- AIC(fit_poisson)
(b0pred_poisson <- predict(fit_poisson, newdata = df_pred, type = "response"))
#####
# fit quasipoisson
# -- formula for extracting log-likelihood from a quasi model is from here (page 2):
# https://cran.r-project.org/web/packages/bbmle/vignettes/quasi.pdf
fit_qpois <- glm(
formula = count ~ 1 + offset(log(pop)),
family = quasipoisson(), data = df_iter)
# deviance(fit_qpois)
# (deviance(fit_qpois)-deviance(fit_qpois))
# If we square the Deviance Residuals and add them all up, we get the Residual Deviance statistic
# https://data.library.virginia.edu/understanding-deviance-residuals/
rd_qpois <- sum(residuals(fit_qpois, type = "deviance")^2)
require(MuMIn)
# adapted from:
# https://search.r-project.org/CRAN/refmans/MuMIn/html/QAIC.html
hacked.quasipoisson <- function(...) {
res <- quasipoisson(...)
res$aic <- poisson(...)$aic
res
}
chat_tmp <- deviance(fit_poisson) / fit_poisson$df.residual
# chat_tmp <- deviance(fit_qpois) / fit_qpois$df.residual
aic_qpois <- QAIC(update(fit_poisson, family = hacked.quasipoisson), chat = chat_tmp)
(b0pred_qpois <- predict(fit_qpois, newdata = df_pred, type = "response"))
# https://cran.r-project.org/web/packages/bbmle/vignettes/quasi.pdf
# Comparing Poisson and quasi-Poisson...
# "The deviance (deviance(glmOT.D93)=5.129 is not the same as −2L (-2*logLik(glmOT.D93)=46.761),
# but the calculated differences in deviance are consistent, and are also extractable
# from the quasi- fit even though the log-likelihood is NA"
#####
# fit beta-binomial
require(aod)
fit_bbinom <- aod::betabin(
# formula = cbind(ymat$count, ymat$noncount) ~ x1,
formula = cbind(ymat$count, ymat$noncount) ~ 1,
random = ~0,
# random = ~1,
data = df_iter
)
rd_bbinom <- deviance(fit_bbinom)
tmp <- summary(fit_bbinom)
aic_bbinom <- (2*tmp@object@nbpar) - (2*tmp@object@logL)
(b0pred_bbinom <- predict(fit_bbinom, newdata = df_pred))
#####
# fit tobit
require(AER)
fit_tobit2 <- AER::tobit(
# formula = pvar_tmp ~ x1,
formula = pvar_tmp ~ 1,
left = 0, right = Inf, data = df_iter)
loglik_tobit <- logLik(fit_tobit2)
rd_tobit <- sum(residuals(fit_tobit2, type = "deviance")^2)
aic_tobit <- -1 * AIC(fit_tobit2) # maybe needs to be flipped for comparability with the others
# aic_tobit <- AIC(fit_tobit2) # maybe needs to be flipped for comparability with the others
(b0pred_tobit <- predict(fit_tobit2, newdata = df_pred))
# adapted from: https://stats.stackexchange.com/questions/149091/censored-regression-in-r
mu <- unique(fitted(fit_tobit2))
sigma <- fit_tobit2$scale
p0 <- pnorm(mu/sigma)
lambda <- function(x) dnorm(x)/pnorm(x)
ey0 <- mu + sigma * lambda(mu/sigma)
ey <- p0 * ey0
out <- list(
rd_binomial=rd_binomial, aic_binomial=aic_binomial, b0pred_binomial=b0pred_binomial,
rd_poisson=rd_poisson, aic_poisson=aic_poisson, b0pred_poisson=b0pred_poisson,
rd_qpois=rd_qpois, aic_qpois=aic_qpois, b0pred_qpois=b0pred_qpois,
rd_bbinom=rd_bbinom, aic_bbinom=aic_bbinom, b0pred_bbinom=b0pred_bbinom,
rd_tobit=rd_tobit, aic_tobit=aic_tobit, b0pred_tobit=b0pred_tobit,
b0pred_tobit2=ey
)
return(out)
}, simplify = FALSE)
return(results1)
}
Run scenarios
b0_true <- qlogis(0.02)
# b0_true <- qlogis(0.15)
scenarios1 <- sapply(as.character(c(1.0, 0.9, 0.8, 0.7)), function(x) {
if (0) {
x <- 0.9
}
cat(x, "\n")
simdata_tmp <- create_simulated_data(
n_iterations = 50, n_obs_per_iteration = 100, b0 = b0_true, b1 = 0,
x_range = c(0, 10), population_size = 100, seed_default = 0, SE = as.numeric(x), SP = 1.0
)
results_tmp <- run_models(simulated_data = simdata_tmp)
if (0) {
str(results_tmp$p1)
}
out <- list(
results_tmp=results_tmp,
simdata_tmp=simdata_tmp
)
return(out)
}, simplify = FALSE)
## 1
## 0.9
## 0.8
## 0.7
Compile results
df_final <- bind_rows(lapply(names(scenarios1), function(x) {
if (0) {
x <- names(scenarios1)[1]
}
require(tidyr)
require(stringr)
dat_tmp <- bind_rows(scenarios1[[x]]$results_tmp) %>%
mutate(
iter = 1:nrow(.),
scenario = x ) %>%
pivot_longer(!iter & !scenario) %>%
separate(name, into = c("metric", "distribution"), sep = "_")
}))
df_alldata <- bind_rows(lapply(names(scenarios1), function(x) {
if (0) {
x <- names(scenarios1)[1]
}
dat_tmp1 <- bind_rows(scenarios1[[x]]$simdata_tmp)
dat_tmp2 <- dat_tmp1[names(dat_tmp1)[names(dat_tmp1) %in% paste0("p", 1:1000)]] %>%
mutate(
scenario = x ) %>%
pivot_longer(!scenario) %>%
mutate(iter = as.numeric(gsub("p", "", name))) %>%
select(-name)
}))
Plots
require(ggplot2)
require(ggExtra)
ggplot2::ggplot(data = df_alldata) +
geom_histogram(mapping = aes(x = value)) +
ggtitle("Data distribution; all iterations pooled") +
facet_wrap(~scenario, ncol = 1) +
geom_vline(xintercept = plogis(b0_true))

ggplot2::ggplot(data = df_final %>% filter(metric == "b0pred")) +
geom_violin(mapping = aes(x = distribution, y = value)) +
ggtitle("Prediction of b0") +
facet_wrap(~scenario) +
geom_hline(yintercept = plogis(b0_true))

ggplot2::ggplot(data = df_final %>% filter(metric == "aic")) +
geom_violin(mapping = aes(x = distribution, y = value)) +
facet_wrap(~scenario) +
ggtitle("AIC")

ggplot2::ggplot(data = df_final %>% filter(metric == "rd")) +
geom_violin(mapping = aes(x = distribution, y = value)) +
facet_wrap(~scenario) +
ggtitle("Residual deviance")
