## ### Using 12 cores
## ### Using doParallelMC as backend
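The two lines above are printed by a parallel-setup chunk that is not shown. A minimal sketch of such a setup, assuming the parallel and doParallel packages (the detected number of cores will differ by machine):
library(parallel)
library(doParallel)
## Detect available cores and register a parallel backend;
## on Unix-alikes getDoParName() reports "doParallelMC".
n_cores <- detectCores()
registerDoParallel(cores = n_cores)
cat("### Using", n_cores, "cores\n")
cat("### Using", getDoParName(), "as backend\n")
## Also let rstanarm sample its chains in parallel.
options(mc.cores = n_cores)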
library(tidyverse)
library(rstanarm)
library(bayesplot)
‘roaches’ Data on the efficacy of a pest management system at reducing the number of roaches in urban apartments.
Source: Gelman and Hill (2007)
262 obs. of 6 variables
• ‘y’ Number of roaches caught
• ‘roach1’ Pretreatment number of roaches
• ‘treatment’ Treatment indicator
• ‘senior’ Indicator for only elderly residents in building
• ‘exposure2’ Number of days for which the roach traps were used
data(roaches)
roaches <- roaches %>%
## Rescale the pretreatment count (now in hundreds of roaches)
mutate(roach1 = roach1 / 100)
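To confirm the structure described above, we can take a quick look at the data (output not shown; glimpse() is attached via the tidyverse):
## Inspect the variables and their types
glimpse(roaches)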
The outcome y is a count. Its mean will be modeled as a function of roach1, treatment, and senior, with exposure2, the number of days the traps were used, included as an offset.
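Concretely, the model as specified below assumes y_i ~ Poisson(mu_i) with log(mu_i) = beta0 + beta1 * roach1_i + beta2 * treatment_i + beta3 * senior_i + exposure2_i, where the offset term enters the linear predictor with its coefficient fixed at 1.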
A Poisson model can be fit using stan_glm.
fit_stan_glm <- rstanarm::stan_glm(
    y ~ roach1 + treatment + senior + offset(exposure2),
    family = poisson,
    data = roaches,
    ## priors for beta0 and betas
    prior_intercept = normal(0, 5),
    prior = normal(0, 2.5))
We can check the model as follows.
## Check priors
rstanarm::prior_summary(fit_stan_glm)
## Priors for model 'fit_stan_glm'
## ------
## Intercept (after predictors centered)
## ~ normal(location = 0, scale = 5)
##
## Coefficients
## ~ normal(location = [0,0,0], scale = [2.5,2.5,2.5])
## **adjusted scale = [3.32,2.50,2.50]
## ------
## See help('prior_summary.stanreg') for more details
## Check fit
summary(fit_stan_glm)
##
## Model Info:
##
## function: stan_glm
## family: poisson [log]
## formula: y ~ roach1 + treatment + senior + offset(exposure2)
## algorithm: sampling
## priors: see help('prior_summary')
## sample: 4000 (posterior sample size)
## observations: 262
## predictors: 4
##
## Estimates:
## mean sd 2.5% 25% 50% 75% 97.5%
## (Intercept) 2.1 0.0 2.1 2.1 2.1 2.1 2.2
## roach1 0.7 0.0 0.7 0.7 0.7 0.7 0.7
## treatment -0.6 0.0 -0.7 -0.6 -0.6 -0.6 -0.6
## senior -0.7 0.0 -0.8 -0.7 -0.7 -0.7 -0.6
## mean_PPD 25.7 0.5 24.8 25.3 25.6 26.0 26.6
## log-posterior -6610.1 1.4 -6613.8 -6610.9 -6609.8 -6609.1 -6608.3
##
## Diagnostics:
## mcse Rhat n_eff
## (Intercept) 0.0 1.0 3927
## roach1 0.0 1.0 4000
## treatment 0.0 1.0 3008
## senior 0.0 1.0 3149
## mean_PPD 0.0 1.0 3118
## log-posterior 0.0 1.0 1628
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
## Check chains
plot(fit_stan_glm, "trace")
## Auto-correlation
bayesplot::mcmc_acf(as.matrix(fit_stan_glm))
## Posterior distribution
bayesplot::mcmc_areas(as.matrix(fit_stan_glm), prob = 0.95)
## Posterior predictive check of density
pp_check(fit_stan_glm)
## Posterior predictive check of histogram
pp_check(fit_stan_glm, plotfun = "hist")
The observed data contain more zeros than the posterior predictive distributions, indicating that the Poisson model underpredicts the number of zero counts.
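One way to check this directly is to compare the proportion of zeros in the observed data with the proportions in the replicated data sets, using a custom test statistic (a sketch; prop_zero is a helper defined here, not part of bayesplot):
## Proportion of zeros, observed vs posterior predictive
prop_zero <- function(y) mean(y == 0)
pp_check(fit_stan_glm, plotfun = "stat", stat = "prop_zero")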
A negative binomial model can be fit using update.
fit_stan_glm2 <- update(fit_stan_glm, family = neg_binomial_2)
We can check the model again.
## Check priors
rstanarm::prior_summary(fit_stan_glm2)
## Priors for model 'fit_stan_glm2'
## ------
## Intercept (after predictors centered)
## ~ normal(location = 0, scale = 5)
##
## Coefficients
## ~ normal(location = [0,0,0], scale = [2.5,2.5,2.5])
## **adjusted scale = [3.32,2.50,2.50]
##
## Auxiliary (reciprocal_dispersion)
## ~ exponential(rate = 1)
## ------
## See help('prior_summary.stanreg') for more details
## Check fit
summary(fit_stan_glm2)
##
## Model Info:
##
## function: stan_glm
## family: neg_binomial_2 [log]
## formula: y ~ roach1 + treatment + senior + offset(exposure2)
## algorithm: sampling
## priors: see help('prior_summary')
## sample: 4000 (posterior sample size)
## observations: 262
## predictors: 4
##
## Estimates:
## mean sd 2.5% 25% 50% 75% 97.5%
## (Intercept) 1.8 0.2 1.4 1.7 1.8 2.0 2.3
## roach1 1.3 0.2 0.9 1.1 1.3 1.5 1.8
## treatment -0.8 0.3 -1.3 -0.9 -0.8 -0.6 -0.3
## senior -0.3 0.3 -0.8 -0.5 -0.3 -0.1 0.2
## reciprocal_dispersion 0.3 0.0 0.2 0.3 0.3 0.3 0.3
## mean_PPD 88.1 145.3 21.3 35.9 52.8 89.1 386.2
## log-posterior -900.3 1.6 -904.3 -901.2 -900.0 -899.1 -898.2
##
## Diagnostics:
## mcse Rhat n_eff
## (Intercept) 0.0 1.0 4000
## roach1 0.0 1.0 4000
## treatment 0.0 1.0 4000
## senior 0.0 1.0 4000
## reciprocal_dispersion 0.0 1.0 4000
## mean_PPD 2.7 1.0 2825
## log-posterior 0.0 1.0 1391
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
## Check chains
plot(fit_stan_glm2, "trace")
## Auto-correlation
bayesplot::mcmc_acf(as.matrix(fit_stan_glm2))
## Posterior distribution
bayesplot::mcmc_areas(as.matrix(fit_stan_glm2), prob = 0.95)
## Posterior predictive check of density
pp_check(fit_stan_glm2, xlim = c(0, 100))
## Warning: The following arguments were unrecognized and ignored: xlim
## Posterior predictive check of histogram
pp_check(fit_stan_glm2, plotfun = "hist")
The negative binomial model places much more probability at zero, better matching the observed excess of zero counts.
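The same check can be done numerically from posterior predictive draws (a sketch using posterior_predict; the exact values will vary by run):
## Observed proportion of zeros
mean(roaches$y == 0)
## Average proportion of zeros across replicated data sets
yrep_pois <- posterior_predict(fit_stan_glm)
yrep_nb <- posterior_predict(fit_stan_glm2)
mean(yrep_pois == 0)
mean(yrep_nb == 0)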
We can use the loo package to compare these models.
## loo: leave-one-out cross-validation approximation
loo_fit_stan_glm <- loo(fit_stan_glm, cores = n_cores)
## Warning: Found 17 observations with a pareto_k > 0.7. With this many problematic observations we recommend calling 'kfold' with argument 'K=10' to perform 10-fold cross-validation rather than LOO.
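Before acting on this warning, we can see which observations are problematic (a sketch based on the loo package's Pareto-k diagnostics):
## Plot Pareto k estimates and list observations above the 0.7 threshold
plot(loo_fit_stan_glm, label_points = TRUE)
loo::pareto_k_ids(loo_fit_stan_glm, threshold = 0.7)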
loo_fit_stan_glm2 <- loo(fit_stan_glm2, cores = n_cores)
compare_models(loo_fit_stan_glm, loo_fit_stan_glm2)
##
## Model comparison:
## (negative 'elpd_diff' favors 1st model, positive favors 2nd)
##
## elpd_diff se
## 5912.2 872.6
## K-fold cross validation (more time consuming)
kfold_fit_stan_glm <- kfold(fit_stan_glm)
kfold_fit_stan_glm2 <- kfold(fit_stan_glm2)
compare_models(kfold_fit_stan_glm, kfold_fit_stan_glm2)
##
## Model comparison:
## (negative 'elpd_diff' favors 1st model, positive favors 2nd)
##
## elpd_diff se
## 6193.7 973.5
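Both the LOO approximation and the K-fold cross-validation give a large positive elpd_diff relative to its standard error, favoring the second model, i.e., the negative binomial model.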