Define the simulation function
run_sim1 <- function(studyspecific_probs, n_obs_per_study, seed = 1) {
if (0) {
studyspecific_probs = seq(0.1, 0.5, by = 0.05)
n_obs_per_study = 20
seed <- 1
}
require(dplyr)
set.seed(seed)
n_studies <- length(studyspecific_probs)
n_obs_total <- n_studies*n_obs_per_study
df1 <- data.frame(row_id = 1:n_obs_total) %>%
mutate(
study_id = rep(1:n_studies, each = n_obs_per_study),
# draw study-specific p from Unif(l,u) first, then take bernoulli sample for each individual
prob1 = rep(studyspecific_probs, each = n_obs_per_study),
draw1 = rbinom(n = nrow(.), size = 1, prob = prob1),
# for each individual, draw a new p from Unif(l,u) and use it to take a bernoulli sample
prob2 = sample(x = studyspecific_probs, size = nrow(.), replace = TRUE),
draw2 = rbinom(n = nrow(.), size = 1, prob = prob2)
)
binom_mean <- plogis(mean(qlogis(studyspecific_probs)))
binom_se <- sqrt((binom_mean / (1-binom_mean)) / n_obs_total)
df2 <- df1 %>%
group_by(study_id) %>%
summarize(
prev1 = sum(draw1) / n(),
prev2 = sum(draw2) / n()
)
metrics1 <- with(df2, list(
mean1 = mean(prev1), sd1 = sd(prev1),
mean2 = mean(prev1), sd2 = sd(prev2)
))
out <- list(df1=df1, df2=df2, metrics1=metrics1, binom_se=binom_se)
return(out)
}
Run the sim on random realizations
n_iterations <- 100
results1 <- lapply(1:n_iterations, function(i) {
run_sim1(
studyspecific_probs = seq(0.1, 0.5, by = 0.05),
n_obs_per_study = 20, seed = i
)
})
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Show the data; first iteration only
Rows unique by individual
library(DT)
DT::datatable(data = results1[[1]]$df1)
Rows unique by study (after calculating prevalence)
DT::datatable(data = results1[[1]]$df2)
Plot the results
library(tidyr)
df_results <- bind_rows(lapply(results1, function(x) {
as.data.frame(append(x$metrics1, list(binom_se = x$binom_se)))
})) %>%
mutate(iter = row_number()) %>%
pivot_longer(cols = -iter, names_to = "metric")
library(ggplot2)
df_plot1 <- df_results %>% filter(metric %in% c("sd1", "sd2"))
ggplot2::ggplot(data = df_plot1) +
ggplot2::ylim(c(0, NA)) +
geom_violin(mapping = aes(x = metric, y = value)) +
ggtitle("Standard deviation of prevalence estimates (100 iterations)") +
facet_wrap(~metric)
