No introduction

On 2020-05-01, 1356 samples (out of 3000) have been collected from a random sample of people. You can read more about the study here. From media reports I gather that two persons have been found to be positive. This is the case for when around 1200 samples have been analyzed so in time, this information may be a bit off.

Assuming 1368 samples have been collected and tested and 2 came up negative, this would make prevalence \(\pi = \frac{2}{1368} = 0.001462\). But what are the confidence and credible intervals around this? Let’s see how two methods fare.

We use generalized linear model to get maximum likelihood parameter (intercept) estimate and a profiled 95 % confidence interval. We use Metropolis-Hastings sampling to obtain Bayesian credible 95 % credible intervals.

Code for MH sampler adapted from here.

posterior <- function(Y, X, beta) {
  prob <- expit(beta[1])
  like <- sum(dbinom(x = Y, size = 1, prob = prob, log = TRUE))
  prior <- sum(dnorm(x = beta, mean = 0, sd = 10, log = TRUE))
  
  like + prior
}

expit <- function(x) {
  exp(x) / (1 + exp(x))
}

bayesianLogistic <- function(Y, X, prev.init, n.samples = 10000, can.sd = 10) {
  beta <- rnorm(1, mean = prev.init, sd = 1)
  
  keep.beta <- matrix(0, nrow = n.samples, ncol = length(beta))
  keep.beta[1, ] <- beta
  
  acc <- att <- 0
  curlp <- posterior(Y = Y, X = X, beta = beta)
  
  for (i in 2:n.samples) {
    att <- att + 1
    
    canbeta <- beta
    canbeta <- rnorm(n = 1, mean = beta, sd = can.sd)
    canlp <- posterior(Y = Y, X = X, beta = canbeta)
    
    R <- exp(canlp - curlp)
    U <- runif(1)
    
    if (U < R) {
      beta <- canbeta
      curlp <- canlp
      acc <- acc + 1
    }
    
    keep.beta[i, ] <- beta
  }
  
  list(beta = keep.beta, acc.rate = acc / att)
}

# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

#' @param pos Integer. Number of positive cases.
#' @param smp Integer. Number of all samples
runSimulation <- function(pos, smp, ylimit, yoffset, xstep) {
  set.seed(357)
  
  Y <- c(rep(0, smp - pos), rep(1, pos))
  X <- rep(1, length(Y))
  
  mdl <- glm(Y ~ 1, family = binomial)
  
  mdl.smr <- summary(mdl)
  ci.ml <- mdl.smr$family$linkinv(confint(mdl))
  coef.ml <- mdl.smr$family$linkinv(coef(mdl))
  
  Y.ml <- data.frame(lci = ci.ml[1], y = coef.ml, uci = ci.ml[2])
  rownames(Y.ml) <- NULL
  Y.ml
  
  burn <- 10000
  n.samples <- 50000
  fit <- bayesianLogistic(Y, X, prev.init = Y.ml$y, n.samples = n.samples, 
                          can.sd = 0.5)
  
  Y.b <- fit$beta[burn:n.samples, ]
  Y.b <- plogis(Y.b)
  
  bins <- 250
  vjust <- 1.5
  
  histbi <- hist(Y.b, plot = FALSE, breaks = bins)
  histbi <- data.frame(histbi[c("mids", "counts")])
  yoffset <- max(histbi$counts) * 0.95
  
  Y.b.quant <- quantile(Y.b, probs = c(0.025, 0.975))
  Y.b.quant <- data.frame(quants = Y.b.quant,
                          y = yoffset,
                          label = round(Y.b.quant, 6))
  
  ggplot(data.frame(Y.b), aes(x = Y.b)) +
    theme_bw() +
    geom_col(data = histbi,
             aes(x = mids, y = counts),
             alpha = 0.75) +
    scale_x_continuous(breaks = seq(from = 0, 
                                    to = max(histbi$mids), 
                                    by = xstep)) +
    scale_y_continuous(limits = c(0, max(histbi$counts)) * 1.1) +
    xlab("Posterior") + ylab("Frequency") +
    geom_vline(xintercept = Y.ml$y, 
               color = "blue", 
               size = 0.75) +
    geom_vline(xintercept = Y.ml$uci, 
               color = "blue", 
               linetype = "dashed", 
               size = 0.75) +
    geom_vline(xintercept = Y.ml$lci, 
               color = "blue", 
               linetype = "dashed", 
               size = 0.75) +
    geom_text(data = Y.b.quant, 
              aes(x = Y.ml$y, 
                  y = yoffset, 
                  label = round(Y.ml$y, 6)),
              color = "blue", 
              angle = 90, 
              vjust = vjust) +
    geom_text(data = Y.b.quant, 
              aes(x = Y.ml$lci,
                  y = yoffset, 
                  label = round(Y.ml$lci, 6)), 
              color = "blue",
              angle = 90, 
              vjust = vjust) +
    geom_text(data = Y.b.quant, 
              aes(x = Y.ml$uci, 
                  y = yoffset, 
                  label = round(Y.ml$uci, 6)), 
              color = "blue", 
              angle = 90, 
              vjust = vjust) +
    geom_vline(xintercept = getmode(Y.b.quant$quants),
               color = "red") +
    geom_vline(xintercept = Y.b.quant$quants, 
               color = "red") +
    geom_text(data = Y.b.quant, 
              aes(x = quants, 
                  y = y, 
                  label = label),
              color = "red",
              angle = 90, 
              vjust = -0.43)
}

Blue solid line represents point estimate of ML method and dashed lines represent 95 % confidence interval. Red lines represent credible intervals of the Bayesian analysis.

runSimulation(pos = 2, smp = 1356, xstep = 0.001)

Infected 1

What would happen to posterior distribution and 95 % ML confidence intervals if we only had 1 positive case (ratio of 0.000737)?

runSimulation(pos = 1, smp = 1356, xstep = 0.001)

Infected 5

What about 5 (ration of 0.00369)?

runSimulation(pos = 5, smp = 1356, xstep = 0.001)

Infected 50

What about 50 (ratio of 0.0369)?

runSimulation(pos = 50, smp = 1356, xstep = 0.005)

Estimate prevalence of SARS-CoV-2 cases in Slovenia

Roman Luštrik

2020-05-01

No introduction

Infected 1

Infected 5

Infected 50