SD calculation for binomial models

fabians — Jul 1, 2014, 6:18 PM

# test binomial CI for conventional model:

# Fit gam-model for binomial data for binary data or 
#   for equivalent grouped/binomial data.
# y|x ~ B(size=ntrials, p=invlogit( f(x) ) )
# with n = ntrials * #(unique x-value) * #(replicates per unique x-value)
eval_bin_gam <- function(nunique=100, # unique x-values
                         ntrials = 1,
                         n=1e4){
    require(mgcv)
    xu <- seq(0, 1, l=nunique)
    nrep <- n/nunique/ntrials
    x <- rep(xu, e=nrep)
    eta <- dbeta(x, 3, 4) - 1
    y <- rbinom(n=length(x), size=ntrials, p=plogis(eta))
    yrel <- y/ntrials

    wts <- rep(ntrials, n/ntrials)
    m <- gam(yrel ~ s(x), family="binomial", weights=wts)

    # vis:
    plot(m, select=1)
    curve(dbeta(x, 3, 4) - 1, col=2, add=TRUE)

    #return coverage & error
    fit <- predict(m, se=TRUE, weights=wts)
    ci <- cbind(fit$fit - 1.96 * fit$se, 
                fit$fit + 1.96 * fit$se) 
    ## ci above misses effect of the weights (?!?):
    ci_correct <- cbind(fit$fit - 1.96*fit$se * sqrt(wts), 
                        fit$fit + 1.96*fit$se * sqrt(wts)) 
    c(rmse = sqrt(mean( (fit$fit - eta)^2 )), 
      cover= mean( (eta > ci[,1]) & (eta < ci[,2])),
      cover_correct= mean( (eta > ci_correct[,1]) & (eta < ci_correct[,2])))
}

################################################################################
## binary data:
set.seed(11212)
par(mfrow=c(3,4))
t(replicate(12, eval_bin_gam(ntrials = 1)))
Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-0. For overview type 'help("mgcv-package")'.

plot of chunk unnamed-chunk-1

         rmse cover cover_correct
 [1,] 0.04076  0.99          0.99
 [2,] 0.06860  0.91          0.91
 [3,] 0.06228  0.97          0.97
 [4,] 0.04654  1.00          1.00
 [5,] 0.04526  1.00          1.00
 [6,] 0.05904  1.00          1.00
 [7,] 0.06735  0.93          0.93
 [8,] 0.05141  1.00          1.00
 [9,] 0.06273  0.85          0.85
[10,] 0.06140  0.90          0.90
[11,] 0.06493  0.97          0.97
[12,] 0.07678  0.86          0.86

################################################################################
## equivalent binomial data: 5 trials (reduced sample size 5-fold to compensate)
set.seed(11212)
par(mfrow=c(3,4))
t(replicate(12, eval_bin_gam(ntrials = 5)))

plot of chunk unnamed-chunk-1

         rmse cover cover_correct
 [1,] 0.07057  0.44          0.98
 [2,] 0.04782  0.75          1.00
 [3,] 0.05869  0.61          0.94
 [4,] 0.05462  0.63          0.99
 [5,] 0.05373  0.71          0.99
 [6,] 0.05589  0.59          1.00
 [7,] 0.08167  0.60          0.79
 [8,] 0.06130  0.47          0.97
 [9,] 0.04123  0.97          0.99
[10,] 0.06026  0.46          0.89
[11,] 0.05602  0.79          0.97
[12,] 0.04897  0.55          1.00
# --> MSEs comparable to binary data, original CIs too narrow 

################################################################################
## equivalent binomial data: 100 trials (reduced sample size 100-fold to compensate)
set.seed(11212)
par(mfrow=c(3,4))
t(replicate(12, eval_bin_gam(ntrials = 100)))

plot of chunk unnamed-chunk-1

         rmse cover cover_correct
 [1,] 0.06303  0.15          0.99
 [2,] 0.05464  0.06          1.00
 [3,] 0.06567  0.27          0.90
 [4,] 0.06636  0.15          0.87
 [5,] 0.04569  0.26          0.95
 [6,] 0.07153  0.11          0.93
 [7,] 0.08274  0.13          0.81
 [8,] 0.07859  0.04          0.71
 [9,] 0.06669  0.13          0.93
[10,] 0.05695  0.11          0.99
[11,] 0.05185  0.26          1.00
[12,] 0.06382  0.33          0.77
# --> MSEs comparable to binary data, original CIs MUCH too narrow 


################################################################################
## fit binary and grouped on exactly the same data 
##    (... should yield same estimates & sd's):

xu <- seq(0, 1, l=100)
reps <- 20
x <- rep(xu, each=reps)
eta <- dbeta(x, 3, 4) - 1
y <- rbinom(n=length(x), size=1, p=plogis(eta))

# group y over xu:
y_gr <- tapply(y, x, mean)

#fit on binary data:
m <- gam(y ~ s(x), family="binomial")
# fit on grouped data:
m_gr <- gam(y_gr ~ s(xu), family="binomial", weights=rep(reps, 100))

summary(m$coefficients - m_gr$coefficients)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-1.66e-06 -1.85e-07  8.99e-08  9.87e-08  5.86e-07  1.46e-06 
summary(predict(m, newdata=list(x=xu)) - predict(m_gr, newdata=list(xu=xu)))
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-2.49e-07 -1.20e-07 -2.40e-09  1.12e-08  1.28e-07  3.97e-07 
## -> practically same fit

summary(diag(m$Vp)/diag(m_gr$Vp))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   23.6    23.6    23.6    23.6    23.6    23.6 
## --> approx <reps>


################################################################################
sessionInfo()
R version 3.0.2 (2013-09-25)
Platform: x86_64-pc-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] mgcv_1.8-0   nlme_3.1-111 knitr_1.5   

loaded via a namespace (and not attached):
[1] evaluate_0.5.1  formatR_0.10    grid_3.0.2      lattice_0.20-24
[5] Matrix_1.1-4    stringr_0.6.2   tools_3.0.2