sex, age, race, and smokeyrs are assumed to be sufficient to address confounding (conditional exchangeability)
age and smokeyrs meet linearity
library(magrittr)
library(XLConnect)
library(rstan)
datFile <- "nhefs_book.xls"
if(!file.exists(datFile)) {
download.file(url = "http://cdn1.sph.harvard.edu/wp-content/uploads/sites/1268/2013/01/nhefs_book.xls",
destfile = datFile)
}
nhefs <- readWorksheetFromFile(datFile, sheet = 1)
## Complete cases only
nhefs <- nhefs[!is.na(nhefs$wt82_71),]
dim(nhefs)
## [1] 1679 61
outcomeModel <- lm(wt82_71 ~ qsmk + sex + age + race + smokeyrs, data = nhefs)
summary(outcomeModel)
##
## Call:
## lm(formula = wt82_71 ~ qsmk + sex + age + race + smokeyrs, data = nhefs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.031 -3.957 -0.039 4.134 45.825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.94751 0.81219 12.248 < 2e-16 ***
## qsmk 2.95305 0.42228 6.993 3.87e-12 ***
## sex -0.09579 0.38366 -0.250 0.8029
## age -0.21764 0.03214 -6.772 1.75e-11 ***
## race 0.06203 0.53826 0.115 0.9083
## smokeyrs 0.06257 0.03252 1.924 0.0545 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.476 on 1673 degrees of freedom
## Multiple R-squared: 0.0814, Adjusted R-squared: 0.07865
## F-statistic: 29.65 on 5 and 1673 DF, p-value: < 2.2e-16
## Create Stan data
dat <- list(N = nrow(nhefs),
p = 6,
wt82_71 = nhefs$wt82_71,
qsmk = nhefs$qsmk,
sex = nhefs$sex,
age = nhefs$age,
race = nhefs$race,
smokeyrs = nhefs$smokeyrs)
## Load Stan file
fileName <- "./hernan_book_regression.stan"
stan_code <- readChar(fileName, file.info(fileName)$size)
cat(stan_code)
## data {
## // Define variables in data
## // Number of observations (an integer)
## int<lower=0> N;
## // Number of parameters
## int<lower=0> p;
## // Variables
## real wt82_71[N];
## int<lower=0> qsmk[N];
## int<lower=0> sex[N];
## real<lower=0> age[N];
## int<lower=0> race[N];
## real<lower=0> smokeyrs[N];
## }
##
## parameters {
## // Define parameters to estimate
## real beta[p];
##
## // standard deviation (a positive real number)
## real<lower=0> sigma;
## }
##
## transformed parameters {
## // Mean
## real mu[N];
## for (i in 1:N) {
## mu[i] <- beta[1] + beta[2]*qsmk[i] + beta[3]*sex[i] + beta[4]*age[i] + beta[5]*race[i] + beta[6]*smokeyrs[i];
## }
## }
##
## model {
## // Prior part of Bayesian inference (flat if unspecified)
##
## // Likelihood part of Bayesian inference
## wt82_71 ~ normal(mu, sigma);
## }
## Run Stan
resStan <- stan(model_code = stan_code, data = dat,
chains = 3, iter = 3000, warmup = 500, thin = 10)
## Show traceplot
traceplot(resStan, pars = c("beta","sigma"), inc_warmup = TRUE)
## Bayesian
print(resStan, pars = c("beta","sigma"))
## Inference for Stan model: stan_code.
## 3 chains, each with iter=3000; warmup=500; thin=10;
## post-warmup draws per chain=250, total post-warmup draws=750.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta[1] 9.93 0.03 0.81 8.35 9.37 9.91 10.52 11.46 709 1
## beta[2] 2.95 0.02 0.42 2.12 2.67 2.94 3.24 3.76 750 1
## beta[3] -0.11 0.01 0.38 -0.84 -0.37 -0.11 0.13 0.65 742 1
## beta[4] -0.22 0.00 0.03 -0.28 -0.24 -0.22 -0.19 -0.16 625 1
## beta[5] 0.07 0.02 0.55 -1.03 -0.25 0.05 0.40 1.27 732 1
## beta[6] 0.06 0.00 0.03 0.00 0.04 0.06 0.08 0.13 750 1
## sigma 7.48 0.00 0.13 7.24 7.39 7.48 7.57 7.76 750 1
##
## Samples were drawn using NUTS(diag_e) at Mon Feb 23 17:50:03 2015.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
## Frequentist
tableone::ShowRegTable(outcomeModel, exp = FALSE)
## beta [confint] p
## (Intercept) 9.95 [8.35, 11.54] <0.001
## qsmk 2.95 [2.12, 3.78] <0.001
## sex -0.10 [-0.85, 0.66] 0.803
## age -0.22 [-0.28, -0.15] <0.001
## race 0.06 [-0.99, 1.12] 0.908
## smokeyrs 0.06 [-0.00, 0.13] 0.055
summary(outcomeModel)$sigma
## [1] 7.475789