sex, age, race, and smokeyrs are assumed to be sufficient to address confounding (conditional exchangeability)
age and smokeyrs meet linearity with the log odds.
library(magrittr)
library(XLConnect)
library(rstan)
datFile <- "nhefs_book.xls"
if(!file.exists(datFile)) {
download.file(url = "http://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/2013/01/nhefs_book.xls",
destfile = datFile)
}
nhefs <- readWorksheetFromFile(datFile, sheet = 1)
## Complete cases only
nhefs <- nhefs[!is.na(nhefs$death),]
dim(nhefs)
## [1] 1746 61
outcomeModel <- glm(death ~ qsmk + sex + age + race + smokeyrs, data = nhefs,
family = binomial(link = "logit"))
summary(outcomeModel)
##
## Call:
## glm(formula = death ~ qsmk + sex + age + race + smokeyrs, family = binomial(link = "logit"),
## data = nhefs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7308 -0.5944 -0.3096 -0.1732 2.9669
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.69921 0.40023 -16.738 < 2e-16 ***
## qsmk -0.01459 0.15739 -0.093 0.92615
## sex -0.46693 0.15231 -3.066 0.00217 **
## age 0.09111 0.01171 7.779 7.34e-15 ***
## race 0.43653 0.20381 2.142 0.03220 *
## smokeyrs 0.03526 0.01112 3.170 0.00152 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1707.3 on 1745 degrees of freedom
## Residual deviance: 1266.1 on 1740 degrees of freedom
## AIC: 1278.1
##
## Number of Fisher Scoring iterations: 5
## Create Stan data
dat <- list(N = nrow(nhefs),
p = 6,
death = nhefs$death,
qsmk = nhefs$qsmk,
sex = nhefs$sex,
age = nhefs$age,
race = nhefs$race,
smokeyrs = nhefs$smokeyrs)
## Load Stan file
fileName <- "./hernan_book_logistic.stan"
stan_code <- readChar(fileName, file.info(fileName)$size)
cat(stan_code)
## data {
## // Define variables in data
## // Number of observations (an integer)
## int<lower=0> N;
## // Number of parameters
## int<lower=0> p;
## // Variables
## int death[N];
## int<lower=0> qsmk[N];
## int<lower=0> sex[N];
## real<lower=0> age[N];
## int<lower=0> race[N];
## real<lower=0> smokeyrs[N];
## }
##
## parameters {
## // Define parameters to estimate
## real beta[p];
## }
##
## transformed parameters {
## // Probability trasformation from linear predictor
## real<lower=0> odds[N];
## real<lower=0, upper=1> prob[N];
##
## for (i in 1:N) {
## odds[i] <- exp(beta[1] + beta[2]*qsmk[i] + beta[3]*sex[i] + beta[4]*age[i] + beta[5]*race[i] + beta[6]*smokeyrs[i]);
## prob[i] <- odds[i] / (odds[i] + 1);
## }
## }
##
## model {
## // Prior part of Bayesian inference (flat if unspecified)
##
## // Likelihood part of Bayesian inference
## death ~ bernoulli(prob);
## }
## Run Stan
resStan <- stan(model_code = stan_code, data = dat,
chains = 3, iter = 3000, warmup = 500, thin = 10)
## Show traceplot
traceplot(resStan, pars = c("beta"), inc_warmup = TRUE)
## Bayesian
print(resStan, pars = c("beta"))
## Inference for Stan model: stan_code.
## 3 chains, each with iter=3000; warmup=500; thin=10;
## post-warmup draws per chain=250, total post-warmup draws=750.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] -6.75 0.02 0.40 -7.50 -7.02 -6.74 -6.46 -5.98 712 1
## beta[2] -0.02 0.01 0.16 -0.33 -0.12 -0.01 0.09 0.28 750 1
## beta[3] -0.46 0.01 0.15 -0.73 -0.57 -0.46 -0.36 -0.16 668 1
## beta[4] 0.09 0.00 0.01 0.07 0.08 0.09 0.10 0.11 605 1
## beta[5] 0.43 0.01 0.21 0.02 0.29 0.42 0.57 0.84 750 1
## beta[6] 0.04 0.00 0.01 0.01 0.03 0.04 0.04 0.06 672 1
##
## Samples were drawn using NUTS(diag_e) at Tue Feb 24 04:47:18 2015.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
## Frequentist
tableone::ShowRegTable(outcomeModel, exp = FALSE)
## beta [confint] p
## (Intercept) -6.70 [-7.50, -5.93] <0.001
## qsmk -0.01 [-0.33, 0.29] 0.926
## sex -0.47 [-0.77, -0.17] 0.002
## age 0.09 [0.07, 0.11] <0.001
## race 0.44 [0.03, 0.83] 0.032
## smokeyrs 0.04 [0.01, 0.06] 0.002