library(ggplot2)
library(parallel)

# Record the start time so the processing time can be reported at the end
# (simtime <- end - start below needs start to be defined).
start <- Sys.time()

# Needed for parallel calculations.
cl <- makeCluster(7, outfile = "clusterfuck.txt")
invisible(clusterEvalQ(cl = cl, expr = source("simulation_functions.R")))

source("simulation_functions.R", max.deparse.length = 1000, echo = TRUE)
## 
## > addFNFPbias <- function(x, fp, fn) {
## +     x <- sort(x)
## +     zeros <- which(x == 0)
## +     ones <- which(x == 1)
## +     if (fp != 0) {
## +         new.fp <- rbinom(n = length(zeros), size = 1, prob = fp)
## +         x[zeros] <- new.fp
## +     }
## +     if (fn != 0) {
## +         new.fn <- rbinom(n = length(ones), size = 1, prob = 1 - 
## +             fn)
## +         x[ones] <- new.fn
## +     }
## +     x
## + }
## 
## > simulateSampling <- function(my.N, myprob, fp, fn) {
## +     vec <- rbinom(n = my.N, size = 1, prob = myprob)
## +     vec <- addFNFPbias(x = vec, fp = fp, fn = fn)
## +     mdl <- glm(vec ~ 1, family = binomial)
## +     cis <- suppressMessages(plogis(confint(mdl)))
## +     out <- as.data.frame(t(cis))
## +     out$N <- my.N
## +     out$y <- plogis(coef(mdl))
## +     out
## + }
## 
## > simulateStudy <- function(N, myseed = NULL, myprob, 
## +     propN, fp = 0, fn = 0) {
## +     if (is.null(myseed)) {
## +         myseed <- sample(1:10^9, size = 1)
## +     }
## +     set.seed(myseed)
## +     sims <- replicate(propN, simulateSampling(my.N = N, myprob = myprob, 
## +         fp = fp, fn = fn), simplify = FALSE)
## +     sims <- do.call(rbind, sims)
## +     colnames(sims) <- c("lci", "uci", "N", "y")
## +     sims$prob <- myprob
## +     sims$fp <- fp
## +     sims$fn <- fn
## +     sims
## + }
## 
## > simulateShell <- function(params) {
## +     sim1 <- parApply(cl = cl, X = params, MARGIN = 1, FUN = function(x) {
## +         simulateStudy(N = x["N"], myprob = x["myprob"], propN = x["propN"], 
## +             fp = x["fp"], fn = x["fn"])
## +     })
## +     sim1 <- do.call(rbind, sim1)
## + }
## 
## > plotPower <- function(x) {
## +     xsplit <- split(x, list(x$N, x$prob, x$fp, x$fn))
## +     xpower <- sapply(xsplit, FUN = function(p) {
## +         p$power <- ifelse(p$prob >= p$lci & p$prob <= p$uci, 
## +             yes = TRUE, no = FALSE)
## +         p
## +     }, simplify = FALSE)
## +     xmedians <- sapply(xpower, FUN = function(p) {
## +         data.frame(lci = median(p$lci), y = median(p$y), uci = median(p$uci), 
## +             power = mean(p$power), prob = unique(p$prob), N = unique(p$N), 
## +             fp = unique(p$fp), fn = unique(p$fn), powerin = sum(p$power))
## +     }, simplify = FALSE)
## +     x <- do.call(rbind, xmedians)
## +     rownames(x) <- NULL
## +     ggplot(x, aes(x = N, y = powerin, color = as.factor(prob))) + 
## +         theme_bw() + theme(legend.position = "top", legend.direction = "horizontal") + 
## +         scale_color_brewer(palette = "Set1", name = "simulated prevalence") + 
## +         scale_y_continuous(breaks = seq(0, 100, by = 10), limits = c(0, 
## +             100)) + scale_x_c .... [TRUNCATED] 
## 
## > plotCI <- function(x) {
## +     xsplit <- split(x, list(x$N, x$prob, x$fp, x$fn))
## +     xmedians <- sapply(xsplit, FUN = function(p) {
## +         data.frame(lci = median(p$lci), y = mean(p$y), uci = median(p$uci), 
## +             prob = unique(p$prob), N = unique(p$N), fp = unique(p$fp), 
## +             fn = unique(p$fn), powerin = sum(p$power))
## +     }, simplify = FALSE)
## +     x <- do.call(rbind, xmedians)
## +     rownames(x) <- NULL
## +     ggplot(x, aes(x = N, y = y)) + theme_bw() + theme(legend.position = "top") + 
## +         geom_vline(xintercept = 3000, color = "grey60", linetype = "dashed") + 
## +         geom_hline(aes(yintercept = prob), color = "grey60", 
## +             linetype = "dashed") + scale_x_continuous(breaks = seq(0, 
## +         10000, by = 2000)) + geom_pointrange(aes(ymin = lci, 
## +         ymax = uci), size = 0.25) + facet_wrap(~prob, scales = "free_y")
## + }
## 
## > summarizeSimulation <- function(x) {
## +     xsplit <- split(x, list(x$N, x$prob, x$fp, x$fn))
## +     xpower <- sapply(xsplit, FUN = function(p) {
## +         p$power <- ifelse(p$prob >= p$lci & p$prob <= p$uci, 
## +             yes = TRUE, no = FALSE)
## +         p
## +     }, simplify = FALSE)
## +     xmedians <- sapply(xpower, FUN = function(p) {
## +         data.frame(lci = median(p$lci), y = mean(p$y), uci = median(p$uci), 
## +             prob = unique(p$prob), N = unique(p$N), fp = unique(p$fp), 
## +             fn = unique(p$fn), powerin = sum(p$power))
## +     }, simplify = FALSE)
## +     x <- do.call(rbind, xmedians)
## +     rownames(x) <- NULL
## +     x
## + }
## 
## > plotEstimates <- function(x) {
## +     ggplot(x) + theme_bw() + theme(axis.text.x = element_text(size = 8, 
## +         angle = 90, vjust = 0.5)) + geom_histogram(aes(x = y)) + 
## +         geom_vline(aes(xintercept = prob), color = "grey60", 
## +             linetype = "dashed") + facet_grid(N ~ prob, scales = "free_x")
## + }
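# Print small numbers (e.g. 0.00075) in fixed rather than scientific notation.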
options(scipen = 14)

No introduction

To reliably estimate the prevalence of a disease in a population, we need to gather a sufficiently large sample.

Algorithm

  1. sample N individuals from a binomial distribution \(Y \sim B(1, myprob)\)
    1. if kerning is used, introduce false positives and false negatives
  2. calculate the confidence intervals and the estimate of prevalence (\(\hat{p}\))
  3. repeat steps 1-2 propN times \(\rightarrow\) X
  4. calculate the proportion of times the simulated myprob falls within the confidence intervals (within_ci) and its standard deviation (sd) for X
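
To make steps 1 and 2 concrete, here is a single iteration written out by hand (seed, sample size and prevalence are arbitrary choices for illustration); simulateSampling() above wraps this same logic:

set.seed(123)                                    # arbitrary seed, for reproducibility
vec <- rbinom(n = 1000, size = 1, prob = 0.01)   # step 1: sample N individuals
mdl <- glm(vec ~ 1, family = binomial)           # intercept-only logistic model
plogis(coef(mdl))                                # prevalence estimate (p-hat)
suppressMessages(plogis(confint(mdl)))           # step 2: 95 % confidence interval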

Parameters in:

  • simulated probability (prob)
  • simulated population size (N)
  • number of times the simulation is run for each N (propN)

Parameters out:

  • lower confidence interval of \(\hat{p}\)
  • parameter estimate \(\hat{p}\)
  • upper confidence interval of \(\hat{p}\)
  • standard deviation of \(\hat{p}\)
  • power (in %)
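
Most of these map onto the columns of the data frame returned by simulateStudy() (lci, y, uci, with the power summaries computed later); a quick look, with arbitrary parameter values:

str(simulateStudy(N = 1000, myseed = 1, myprob = 0.01, propN = 5))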

Kerning

The above algorithm assumes an ideal situation (what some might consider a spherical chicken in a vacuum). To make things more interesting, let’s introduce false positive and false negative rates.

Additional parameters have been introduced, namely fp and fn.

For each simulation, kerning (adjustment of values) is performed after the N samples have been drawn from \(Y \sim B(1, myprob)\). To introduce false positives, negative cases are replaced by draws from \(K_{FP} \sim B(1, fp)\), so each negative flips to positive with probability fp. To introduce false negatives, positive cases are replaced by draws from \(K_{FN} \sim B(1, 1 - fn)\), so each positive is retained with probability \(1 - fn\).
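
A quick demonstration of the kerning step on a small vector, using the addFNFPbias() function sourced above (seed and rates are arbitrary; note that the function sorts its input, so zeros come first):

set.seed(1)                                # arbitrary seed
x <- rbinom(n = 20, size = 1, prob = 0.5)  # 20 raw test results
addFNFPbias(x = x, fp = 0.1, fn = 0.33)    # some 0s flip to 1, some 1s flip to 0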

Simulation assumptions

  • sampling is done at random with respect to all variables that influence prevalence (spatial, temporal, demographic, medical…)
  • there are no test-shy responders (people unwilling to take the test for whatever reason)
  • false positives and false negatives occur at random

Parameters

Simulated prevalences start at 0.00075, which assumes 1500 infected in a population of 2000000. It is probably safe to assume that the number of infected or convalesced would not fall below this threshold. Sample sizes go from 1000 to 10000: the lower end because anything smaller would struggle to catch even one infected individual at the assumed prevalence of 0.00075, and the upper end of 10000 because that is at the limit of what this author would consider feasible to sample and test in the current climate.

probs1 <- c(0.00075, 0.001, 0.003, 
            0.005, 0.008, 0.01, 
            0.05, 0.07, 0.1)
propN <- 100
stepN <- seq(from = 1000, to = 10000, by = 1000)

Ideal simulation

In this scenario, there are no false positive or false negative cases. The so called “best case scenario”.

sim1.params <- expand.grid(N = stepN,
                           myprob = probs1, 
                           propN = propN,
                           fp = 0,
                           fn = 0)
sim1 <- simulateShell(params = sim1.params)
plotPower(sim1)

Each point represents the number of simulations, out of 100, in which the simulated value was covered by the 95 % confidence interval, i.e. the power.

Power appears to be pretty high for all but the lowest simulated prevalence values of 0.00075 and 0.001. This indicates that, even with as few as 1000 samples, the best case scenario would yield a viable result.

plotCI(sim1)

This graph shows the mean estimated prevalence and the median lower and upper 95 % confidence interval limits. The previous figure showed that a sample of 1000 would give us good power for a decently large prevalence (above 0.3 %). But this figure shows that the intervals would be rather wide. For example, for prevalence 0.003 and a sample of 1000, the 95 % confidence interval could go from 0.001 to 0.008 (a span of 0.007), whereas for a sample size of 3000 (dashed vertical line), the interval would go from 0.0015 to 0.005 (a span of 0.0035, or half of that for \(N=1000\)).
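
The spans quoted above can be pulled straight from the simulation summary; a small sketch (s1 and span are names introduced here, and exact values will vary between runs):

s1 <- summarizeSimulation(sim1)
s1$span <- s1$uci - s1$lci               # width of the median 95 % CI
s1[s1$prob == 0.003 & s1$N %in% c(1000, 3000),
   c("N", "prob", "lci", "y", "uci", "span")]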

plotEstimates(sim1)

This figure shows the distribution of estimates around the simulated prevalence value. Notice how the estimates take discrete values for low sample sizes and low prevalences. The distributions approximate a normal distribution as more data becomes available, which is what we would expect.
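
The discreteness is simply counting: with \(N = 1000\) the estimate can only take values \(k/1000\), and at a prevalence of 0.00075 the expected count is below one, so only a handful of values ever occur. A quick illustration (arbitrary seed):

set.seed(7)                                             # arbitrary seed
counts <- rbinom(n = 100, size = 1000, prob = 0.00075)  # positives per study
table(counts / 1000)                                    # the only p-hat values seen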

Simulation with kerning

The above simulation is done in a perfect world where sampling is 100 % efficient and tests make no mistakes. The following simulation will introduce bias. Specifically, we will convert positive cases to negative, simulating a false negative rate (fn). To keep things simple, no false positives will be introduced.

The false negative rate from the literature is assumed to be \(fn = 0.33\) (worst case scenario, swab taken 10 days after symptom onset). See Wikramaratna et al. (2020) for a discussion of false negative rates. Briefly, they depend on the timing and the site from which the samples were collected (nasal vs. throat swab), while false positive rates are low mainly because the RT-PCR method is highly sensitive and specific (e.g. capable of picking up as few as 10 molecules).

Using substantial kerning

A pretty pessimistic scenario would be one where the false negative rate is 0.33 (33 % of positive cases are not detected as such). The false positive rate is set to 0 because we want to examine one process at a time.

sim2.params <- expand.grid(N = stepN,
                           myprob = probs1, 
                           propN = propN,
                           fp = 0,
                           fn = 0.33)
sim2 <- simulateShell(params = sim2.params)
plotPower(sim2)

Compared to the ideal scenario, power takes a hit for most simulated prevalence values, and it decreases with increasing sample size. One explanation: the false negatives bias the estimate downward, and with larger samples the confidence intervals shrink around this underestimated value, so they cover the true prevalence less and less often.
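
Given the kerning mechanism, the expected observed prevalence can be written down directly: each true positive survives with probability \(1 - fn\) and, with \(fp = 0\), negatives stay negative, so \(p_{obs} = p(1 - fn)\). A back-of-the-envelope check for one of the simulated prevalences:

p <- 0.05                # one of the simulated prevalence values
fn <- 0.33               # the simulated false negative rate
p * (1 - fn)             # expected observed prevalence: 0.0335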

plotCI(sim2)

The horizontal dashed lines represent the simulated prevalence values. Notice how the estimates are consistently biased downward. Because the confidence intervals are wide for smaller sample sizes, the power appears higher in those cases, but at the cost of accuracy. Higher prevalence values also mean a larger negative bias.

plotEstimates(sim2)

A similar representation to the previous figure, but with the distribution of estimates shown in more detail. With few data points (\(N=1000\), \(prob = 0.00075\)) the estimates still take rather discrete values, and as the number of available data points increases (larger samples or higher prevalence), the distributions start to approximate a normal distribution. The underestimation is evident, with estimates aggregating to the left of the simulated prevalence value (vertical dashed line).

Using less kerning

Simulating a false negative rate of 0.1 (10 %).

FN = 0.1

sim3.params <- expand.grid(N = stepN,
                           myprob = probs1, 
                           propN = propN,
                           fp = 0,
                           fn = 0.1)
sim3 <- simulateShell(params = sim3.params)
plotPower(sim3)

Power improves, especially for simulated prevalence values of 0.1, 0.07 and 0.05.

plotCI(sim3)

Bias is still present, but mostly at the upper end of the simulated prevalence values.

plotEstimates(sim3)

Compared to the simulations with \(fn=0.33\), the underestimation of prevalence has decreased.

FN = 0.01

Let's examine whether the underestimation of prevalence decreases even further.

sim4.params <- expand.grid(N = stepN,
                           myprob = probs1, 
                           propN = propN,
                           fp = 0,
                           fn = 0.01)
sim4 <- simulateShell(params = sim4.params)
plotPower(sim4)

plotCI(sim4)

plotEstimates(sim4)

It does.

FN = 0.001

More power to the engines! False negative rate set to 0.001.

sim5.params <- expand.grid(N = stepN,
                           myprob = probs1, 
                           propN = propN,
                           fp = 0,
                           fn = 0.001)
sim5 <- simulateShell(params = sim5.params)
plotPower(sim5)

plotCI(sim5)

plotEstimates(sim5)

Bias is not apparent; the results appear similar to those of the ideal simulation.

Alternate view of the data

The images below show the ideal and kerned simulations side by side for all simulated prevalence values. Notice the drop in power for high false negative rates in conjunction with large sample sizes. My guess would be that for smaller sample sizes the wide confidence intervals mask the effect of kerning, and as the sample size increases the intervals tighten around the underestimated value, so the power drops.

sim1summ <- summarizeSimulation(sim1)
sim1summ$source <- "ideal"
sim2summ <- summarizeSimulation(sim2)
sim2summ$source <- "fn=0.33"
sim3summ <- summarizeSimulation(sim3)
sim3summ$source <- "fn=0.1"
sim4summ <- summarizeSimulation(sim4)
sim4summ$source <- "fn=0.01"
sim5summ <- summarizeSimulation(sim5)
sim5summ$source <- "fn=0.001"

sims <- do.call(rbind, list(sim1summ, sim2summ, sim3summ,
                            sim4summ, sim5summ))

ggplot(sims, aes(x = N, y = powerin, color = source)) +
        theme_bw() +
        theme(legend.position = "top", legend.direction = "horizontal") +
        scale_color_brewer(palette = "Set1", name = "simulated conditions") +
        scale_x_continuous(breaks = seq(0, 10000, by = 2000)) +
        geom_vline(xintercept = 3000, color = "grey60", linetype = "dashed") +
        geom_line() +
        geom_point(size = 2, alpha = 0.75) +
        facet_wrap(~ prob)

ggplot(do.call(rbind, list(sim1, sim2, sim3, sim4, sim5)), 
       aes(fill = as.factor(fn))) +
  theme_bw() +
  theme(legend.position = "top", 
        axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5)) +
  scale_fill_brewer(palette = "Set1", name = "Simulated bias (FN)") +
  geom_histogram(aes(x = y), alpha = 0.7, position = "identity") +
  geom_vline(aes(xintercept = prob), color = "grey60", linetype = "dashed") +
  facet_grid(N ~ prob, scales = "free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

There appears to be a sweet spot for the false negative rate where its effect on the overall experiment is still negligible, assuming all other assumptions hold. Based on the above results, this would be in the single-digit percent range.
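
One way to put a number on that from the existing summaries is the median relative bias of the estimate per scenario; a small sketch using the sims data frame built above (rel.bias is a name introduced here, and under the kerning mechanism it should come out near \(-fn\)):

sims$rel.bias <- (sims$y - sims$prob) / sims$prob   # relative bias of the estimate
aggregate(rel.bias ~ source, data = sims, FUN = median)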

Used packages

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora 31 (Workstation Edition)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=sl_SI.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=sl_SI.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=sl_SI.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=sl_SI.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] ggplot2_3.2.1
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3         pillar_1.4.3       compiler_3.6.2     RColorBrewer_1.1-2
##  [5] plyr_1.8.5         tools_3.6.2        digest_0.6.23      evaluate_0.14     
##  [9] lifecycle_0.1.0    tibble_2.1.3       gtable_0.3.0       pkgconfig_2.0.3   
## [13] rlang_0.4.4        yaml_2.2.1         xfun_0.12          withr_2.1.2       
## [17] dplyr_0.8.4        stringr_1.4.0      knitr_1.27         grid_3.6.2        
## [21] tidyselect_1.0.0   glue_1.3.1         R6_2.4.1           rmarkdown_2.1     
## [25] purrr_0.3.3        farver_2.0.3       reshape2_1.4.3     magrittr_1.5      
## [29] scales_1.1.0       htmltools_0.4.0    assertthat_0.2.1   colorspace_1.4-1  
## [33] labeling_0.3       stringi_1.4.5      lazyeval_0.2.2     munsell_0.5.0     
## [37] crayon_1.3.4
# Shut down the worker processes and record the total processing time.
stopCluster(cl)
end <- Sys.time()
simtime <- end - start

Processing took 32.134 secs (results may have been cached, so your experience may differ).