On 2020-05-01, 1356 samples (out of 3000) have been collected from a random sample of people. You can read more about the study here. From media reports I gather that two persons have been found to be positive. This is the case for when around 1200 samples have been analyzed so in time, this information may be a bit off.
Assuming 1368 samples have been collected and tested and 2 came up negative, this would make prevalence \(\pi = \frac{2}{1368} = 0.001462\). But what are the confidence and credible intervals around this? Let’s see how two methods fare.
We use generalized linear model to get maximum likelihood parameter (intercept) estimate and a profiled 95 % confidence interval. We use Metropolis-Hastings sampling to obtain Bayesian credible 95 % credible intervals.
Code for MH sampler adapted from here.
posterior <- function(Y, X, beta) {
prob <- expit(beta[1])
like <- sum(dbinom(x = Y, size = 1, prob = prob, log = TRUE))
prior <- sum(dnorm(x = beta, mean = 0, sd = 10, log = TRUE))
like + prior
}
expit <- function(x) {
exp(x) / (1 + exp(x))
}
bayesianLogistic <- function(Y, X, prev.init, n.samples = 10000, can.sd = 10) {
beta <- rnorm(1, mean = prev.init, sd = 1)
keep.beta <- matrix(0, nrow = n.samples, ncol = length(beta))
keep.beta[1, ] <- beta
acc <- att <- 0
curlp <- posterior(Y = Y, X = X, beta = beta)
for (i in 2:n.samples) {
att <- att + 1
canbeta <- beta
canbeta <- rnorm(n = 1, mean = beta, sd = can.sd)
canlp <- posterior(Y = Y, X = X, beta = canbeta)
R <- exp(canlp - curlp)
U <- runif(1)
if (U < R) {
beta <- canbeta
curlp <- canlp
acc <- acc + 1
}
keep.beta[i, ] <- beta
}
list(beta = keep.beta, acc.rate = acc / att)
}
# https://www.tutorialspoint.com/r/r_mean_median_mode.htm
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
#' @param pos Integer. Number of positive cases.
#' @param smp Integer. Number of all samples
runSimulation <- function(pos, smp, ylimit, yoffset, xstep) {
set.seed(357)
Y <- c(rep(0, smp - pos), rep(1, pos))
X <- rep(1, length(Y))
mdl <- glm(Y ~ 1, family = binomial)
mdl.smr <- summary(mdl)
ci.ml <- mdl.smr$family$linkinv(confint(mdl))
coef.ml <- mdl.smr$family$linkinv(coef(mdl))
Y.ml <- data.frame(lci = ci.ml[1], y = coef.ml, uci = ci.ml[2])
rownames(Y.ml) <- NULL
Y.ml
burn <- 10000
n.samples <- 50000
fit <- bayesianLogistic(Y, X, prev.init = Y.ml$y, n.samples = n.samples,
can.sd = 0.5)
Y.b <- fit$beta[burn:n.samples, ]
Y.b <- plogis(Y.b)
bins <- 250
vjust <- 1.5
histbi <- hist(Y.b, plot = FALSE, breaks = bins)
histbi <- data.frame(histbi[c("mids", "counts")])
yoffset <- max(histbi$counts) * 0.95
Y.b.quant <- quantile(Y.b, probs = c(0.025, 0.975))
Y.b.quant <- data.frame(quants = Y.b.quant,
y = yoffset,
label = round(Y.b.quant, 6))
ggplot(data.frame(Y.b), aes(x = Y.b)) +
theme_bw() +
geom_col(data = histbi,
aes(x = mids, y = counts),
alpha = 0.75) +
scale_x_continuous(breaks = seq(from = 0,
to = max(histbi$mids),
by = xstep)) +
scale_y_continuous(limits = c(0, max(histbi$counts)) * 1.1) +
xlab("Posterior") + ylab("Frequency") +
geom_vline(xintercept = Y.ml$y,
color = "blue",
size = 0.75) +
geom_vline(xintercept = Y.ml$uci,
color = "blue",
linetype = "dashed",
size = 0.75) +
geom_vline(xintercept = Y.ml$lci,
color = "blue",
linetype = "dashed",
size = 0.75) +
geom_text(data = Y.b.quant,
aes(x = Y.ml$y,
y = yoffset,
label = round(Y.ml$y, 6)),
color = "blue",
angle = 90,
vjust = vjust) +
geom_text(data = Y.b.quant,
aes(x = Y.ml$lci,
y = yoffset,
label = round(Y.ml$lci, 6)),
color = "blue",
angle = 90,
vjust = vjust) +
geom_text(data = Y.b.quant,
aes(x = Y.ml$uci,
y = yoffset,
label = round(Y.ml$uci, 6)),
color = "blue",
angle = 90,
vjust = vjust) +
geom_vline(xintercept = getmode(Y.b.quant$quants),
color = "red") +
geom_vline(xintercept = Y.b.quant$quants,
color = "red") +
geom_text(data = Y.b.quant,
aes(x = quants,
y = y,
label = label),
color = "red",
angle = 90,
vjust = -0.43)
}
Blue solid line represents point estimate of ML method and dashed lines represent 95 % confidence interval. Red lines represent credible intervals of the Bayesian analysis.
runSimulation(pos = 2, smp = 1356, xstep = 0.001)
What would happen to posterior distribution and 95 % ML confidence intervals if we only had 1 positive case (ratio of 0.000737)?
runSimulation(pos = 1, smp = 1356, xstep = 0.001)
What about 5 (ration of 0.00369)?
runSimulation(pos = 5, smp = 1356, xstep = 0.001)
What about 50 (ratio of 0.0369)?
runSimulation(pos = 50, smp = 1356, xstep = 0.005)