References

Assumptions

Load packages

library(magrittr)
library(XLConnect)
library(rstan)

Load data

datFile <- "nhefs_book.xls"
if(!file.exists(datFile)) {
    download.file(url = "http://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/2013/01/nhefs_book.xls",
                  destfile = datFile)
}
nhefs <- readWorksheetFromFile(datFile, sheet = 1)
## Complete cases only
nhefs <- nhefs[!is.na(nhefs$death),]
dim(nhefs)
## [1] 1746   61

Regular frequentist approach

outcomeModel <- glm(death ~ qsmk + sex + age + race + smokeyrs, data = nhefs,
                    family = binomial(link = "logit"))
summary(outcomeModel)
## 
## Call:
## glm(formula = death ~ qsmk + sex + age + race + smokeyrs, family = binomial(link = "logit"), 
##     data = nhefs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7308  -0.5944  -0.3096  -0.1732   2.9669  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.69921    0.40023 -16.738  < 2e-16 ***
## qsmk        -0.01459    0.15739  -0.093  0.92615    
## sex         -0.46693    0.15231  -3.066  0.00217 ** 
## age          0.09111    0.01171   7.779 7.34e-15 ***
## race         0.43653    0.20381   2.142  0.03220 *  
## smokeyrs     0.03526    0.01112   3.170  0.00152 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1707.3  on 1745  degrees of freedom
## Residual deviance: 1266.1  on 1740  degrees of freedom
## AIC: 1278.1
## 
## Number of Fisher Scoring iterations: 5

Bayesian approach using Stan

## Create Stan data
dat <- list(N        = nrow(nhefs),
            p        = 6,
            death    = nhefs$death,
            qsmk     = nhefs$qsmk,
            sex      = nhefs$sex,
            age      = nhefs$age,
            race     = nhefs$race,
            smokeyrs = nhefs$smokeyrs)

## Load Stan file
fileName <- "./hernan_book_logistic.stan"
stan_code <- readChar(fileName, file.info(fileName)$size)
cat(stan_code)
## data {
##   // Define variables in data
##   // Number of observations (an integer)
##   int<lower=0> N;
##   // Number of parameters
##   int<lower=0> p;
##   // Variables
##   int death[N];
##   int<lower=0>  qsmk[N];
##   int<lower=0>  sex[N];
##   real<lower=0> age[N];
##   int<lower=0>  race[N];
##   real<lower=0> smokeyrs[N];
## }
## 
## parameters {
##   // Define parameters to estimate
##   real beta[p];
## }
## 
## transformed parameters  {
##   // Probability trasformation from linear predictor
##   real<lower=0> odds[N];
##   real<lower=0, upper=1> prob[N];
## 
##   for (i in 1:N) {
##     odds[i] <- exp(beta[1] + beta[2]*qsmk[i] + beta[3]*sex[i] + beta[4]*age[i] + beta[5]*race[i] + beta[6]*smokeyrs[i]);
##     prob[i] <- odds[i] / (odds[i] + 1);
##   }
## }
## 
## model {
##   // Prior part of Bayesian inference (flat if unspecified)
## 
##   // Likelihood part of Bayesian inference
##     death ~ bernoulli(prob);
## }
## Run Stan
resStan <- stan(model_code = stan_code, data = dat,
                chains = 3, iter = 3000, warmup = 500, thin = 10)

## Show traceplot
traceplot(resStan, pars = c("beta"), inc_warmup = TRUE)

Comparison

## Bayesian
print(resStan, pars = c("beta"))
## Inference for Stan model: stan_code.
## 3 chains, each with iter=3000; warmup=500; thin=10; 
## post-warmup draws per chain=250, total post-warmup draws=750.
## 
##          mean se_mean   sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## beta[1] -6.75    0.02 0.40 -7.50 -7.02 -6.74 -6.46 -5.98   712    1
## beta[2] -0.02    0.01 0.16 -0.33 -0.12 -0.01  0.09  0.28   750    1
## beta[3] -0.46    0.01 0.15 -0.73 -0.57 -0.46 -0.36 -0.16   668    1
## beta[4]  0.09    0.00 0.01  0.07  0.08  0.09  0.10  0.11   605    1
## beta[5]  0.43    0.01 0.21  0.02  0.29  0.42  0.57  0.84   750    1
## beta[6]  0.04    0.00 0.01  0.01  0.03  0.04  0.04  0.06   672    1
## 
## Samples were drawn using NUTS(diag_e) at Tue Feb 24 04:47:18 2015.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
## Frequentist
tableone::ShowRegTable(outcomeModel, exp = FALSE)
##             beta [confint]       p     
## (Intercept) -6.70 [-7.50, -5.93] <0.001
## qsmk        -0.01 [-0.33, 0.29]   0.926
## sex         -0.47 [-0.77, -0.17]  0.002
## age          0.09 [0.07, 0.11]   <0.001
## race         0.44 [0.03, 0.83]    0.032
## smokeyrs     0.04 [0.01, 0.06]    0.002