fabians — Jul 1, 2014, 6:18 PM
# test binomial CI for conventional model:
# Fit gam-model for binomial data for binary data or
# for equivalent grouped/binomial data.
# y|x ~ B(size=ntrials, p=invlogit( f(x) ) )
# with n = ntrials * #(unique x-value) * #(replicates per unique x-value)
eval_bin_gam <- function(nunique=100, # unique x-values
ntrials = 1,
n=1e4){
require(mgcv)
xu <- seq(0, 1, l=nunique)
nrep <- n/nunique/ntrials
x <- rep(xu, e=nrep)
eta <- dbeta(x, 3, 4) - 1
y <- rbinom(n=length(x), size=ntrials, p=plogis(eta))
yrel <- y/ntrials
wts <- rep(ntrials, n/ntrials)
m <- gam(yrel ~ s(x), family="binomial", weights=wts)
# vis:
plot(m, select=1)
curve(dbeta(x, 3, 4) - 1, col=2, add=TRUE)
#return coverage & error
fit <- predict(m, se=TRUE, weights=wts)
ci <- cbind(fit$fit - 1.96 * fit$se,
fit$fit + 1.96 * fit$se)
## ci above misses effect of the weights (?!?):
ci_correct <- cbind(fit$fit - 1.96*fit$se * sqrt(wts),
fit$fit + 1.96*fit$se * sqrt(wts))
c(rmse = sqrt(mean( (fit$fit - eta)^2 )),
cover= mean( (eta > ci[,1]) & (eta < ci[,2])),
cover_correct= mean( (eta > ci_correct[,1]) & (eta < ci_correct[,2])))
}
################################################################################
## binary data:
set.seed(11212)
par(mfrow=c(3,4))
t(replicate(12, eval_bin_gam(ntrials = 1)))
Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-0. For overview type 'help("mgcv-package")'.
rmse cover cover_correct
[1,] 0.04076 0.99 0.99
[2,] 0.06860 0.91 0.91
[3,] 0.06228 0.97 0.97
[4,] 0.04654 1.00 1.00
[5,] 0.04526 1.00 1.00
[6,] 0.05904 1.00 1.00
[7,] 0.06735 0.93 0.93
[8,] 0.05141 1.00 1.00
[9,] 0.06273 0.85 0.85
[10,] 0.06140 0.90 0.90
[11,] 0.06493 0.97 0.97
[12,] 0.07678 0.86 0.86
################################################################################
## equivalent binomial data: 5 trials (reduced sample size 5-fold to compensate)
set.seed(11212)
par(mfrow=c(3,4))
t(replicate(12, eval_bin_gam(ntrials = 5)))
rmse cover cover_correct
[1,] 0.07057 0.44 0.98
[2,] 0.04782 0.75 1.00
[3,] 0.05869 0.61 0.94
[4,] 0.05462 0.63 0.99
[5,] 0.05373 0.71 0.99
[6,] 0.05589 0.59 1.00
[7,] 0.08167 0.60 0.79
[8,] 0.06130 0.47 0.97
[9,] 0.04123 0.97 0.99
[10,] 0.06026 0.46 0.89
[11,] 0.05602 0.79 0.97
[12,] 0.04897 0.55 1.00
# --> MSEs comparable to binary data, original CIs too narrow
################################################################################
## equivalent binomial data: 100 trials (reduced sample size 100-fold to compensate)
set.seed(11212)
par(mfrow=c(3,4))
t(replicate(12, eval_bin_gam(ntrials = 100)))
rmse cover cover_correct
[1,] 0.06303 0.15 0.99
[2,] 0.05464 0.06 1.00
[3,] 0.06567 0.27 0.90
[4,] 0.06636 0.15 0.87
[5,] 0.04569 0.26 0.95
[6,] 0.07153 0.11 0.93
[7,] 0.08274 0.13 0.81
[8,] 0.07859 0.04 0.71
[9,] 0.06669 0.13 0.93
[10,] 0.05695 0.11 0.99
[11,] 0.05185 0.26 1.00
[12,] 0.06382 0.33 0.77
# --> MSEs comparable to binary data, original CIs MUCH too narrow
################################################################################
## fit binary and grouped on exactly the same data
## (... should yield same estimates & sd's):
xu <- seq(0, 1, l=100)
reps <- 20
x <- rep(xu, each=reps)
eta <- dbeta(x, 3, 4) - 1
y <- rbinom(n=length(x), size=1, p=plogis(eta))
# group y over xu:
y_gr <- tapply(y, x, mean)
#fit on binary data:
m <- gam(y ~ s(x), family="binomial")
# fit on grouped data:
m_gr <- gam(y_gr ~ s(xu), family="binomial", weights=rep(reps, 100))
summary(m$coefficients - m_gr$coefficients)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.66e-06 -1.85e-07 8.99e-08 9.87e-08 5.86e-07 1.46e-06
summary(predict(m, newdata=list(x=xu)) - predict(m_gr, newdata=list(xu=xu)))
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.49e-07 -1.20e-07 -2.40e-09 1.12e-08 1.28e-07 3.97e-07
## -> practically same fit
summary(diag(m$Vp)/diag(m_gr$Vp))
Min. 1st Qu. Median Mean 3rd Qu. Max.
23.6 23.6 23.6 23.6 23.6 23.6
## --> approx <reps>
################################################################################
sessionInfo()
R version 3.0.2 (2013-09-25)
Platform: x86_64-pc-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] mgcv_1.8-0 nlme_3.1-111 knitr_1.5
loaded via a namespace (and not attached):
[1] evaluate_0.5.1 formatR_0.10 grid_3.0.2 lattice_0.20-24
[5] Matrix_1.1-4 stringr_0.6.2 tools_3.0.2